; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F

;
; General cases - packing of vector comparison to legal vector result types
;
; Select between %a2/%a3 using a <16 x i16> equality compare whose sext'd
; result is truncated to a <16 x i8> mask (expected to lower to PACKSSWB).
define <16 x i8> @vselect_packss_v16i16(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE-LABEL: vselect_packss_v16i16:
; SSE:       # BB#0:
; SSE-NEXT:    pcmpeqw %xmm3, %xmm1
; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE-NEXT:    packsswb %xmm1, %xmm0
; SSE-NEXT:    pand %xmm0, %xmm4
; SSE-NEXT:    pandn %xmm5, %xmm0
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vselect_packss_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_packss_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm1
; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vselect_packss_v16i16:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm1
; AVX512-NEXT:    vpandn %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = icmp eq <16 x i16> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}
; As above but the compare is <16 x i32>, requiring two levels of packing
; down to the <16 x i8> select mask.
define <16 x i8> @vselect_packss_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE-LABEL: vselect_packss_v16i32:
; SSE:       # BB#0:
; SSE-NEXT:    pcmpeqd %xmm7, %xmm3
; SSE-NEXT:    pcmpeqd %xmm6, %xmm2
; SSE-NEXT:    packsswb %xmm3, %xmm2
; SSE-NEXT:    pcmpeqd %xmm5, %xmm1
; SSE-NEXT:    pcmpeqd %xmm4, %xmm0
; SSE-NEXT:    packsswb %xmm1, %xmm0
; SSE-NEXT:    packsswb %xmm2, %xmm0
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pandn {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vselect_packss_v16i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpacksswb %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
; AVX1-NEXT:    vpandn %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_packss_v16i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm1
; AVX2-NEXT:    vpandn %xmm5, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vselect_packss_v16i32:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm1
; AVX512-NEXT:    vpandn %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = icmp eq <16 x i32> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}
; As above but the compare is <16 x i64>, requiring three levels of packing
; down to the <16 x i8> select mask.
define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss_v16i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,0,3,2]
; SSE2-NEXT:    pand %xmm7, %xmm8
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,0,3,2]
; SSE2-NEXT:    pand %xmm6, %xmm7
; SSE2-NEXT:    packsswb %xmm8, %xmm7
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    packsswb %xmm6, %xmm5
; SSE2-NEXT:    packsswb %xmm7, %xmm5
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    packsswb %xmm4, %xmm3
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    packsswb %xmm2, %xmm1
; SSE2-NEXT:    packsswb %xmm3, %xmm1
; SSE2-NEXT:    packsswb %xmm5, %xmm1
; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pandn {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vselect_packss_v16i64:
; SSE42:       # BB#0:
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm7
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm6
; SSE42-NEXT:    packsswb %xmm7, %xmm6
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm5
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm4
; SSE42-NEXT:    packsswb %xmm5, %xmm4
; SSE42-NEXT:    packsswb %xmm6, %xmm4
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm3
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm2
; SSE42-NEXT:    packsswb %xmm3, %xmm2
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm0
; SSE42-NEXT:    packsswb %xmm1, %xmm0
; SSE42-NEXT:    packsswb %xmm2, %xmm0
; SSE42-NEXT:    packsswb %xmm4, %xmm0
; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT:    pand %xmm0, %xmm1
; SSE42-NEXT:    pandn {{[0-9]+}}(%rsp), %xmm0
; SSE42-NEXT:    por %xmm1, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: vselect_packss_v16i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm9
; AVX1-NEXT:    vpcmpeqq %xmm8, %xmm9, %xmm8
; AVX1-NEXT:    vpcmpeqq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpacksswb %xmm8, %xmm3, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpcmpeqq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpacksswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpacksswb %xmm8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpacksswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpcmpeqq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
; AVX1-NEXT:    vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_packss_v16i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpcmpeqq %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    vpcmpeqq %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpacksswb %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqq %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
; AVX2-NEXT:    vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vselect_packss_v16i64:
; AVX512:       # BB#0:
; AVX512-NEXT:    vextracti32x4 $3, %zmm2, %xmm6
; AVX512-NEXT:    vpextrq $1, %xmm6, %rcx
; AVX512-NEXT:    vextracti32x4 $3, %zmm0, %xmm7
; AVX512-NEXT:    vpextrq $1, %xmm7, %rdx
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    cmpq %rcx, %rdx
; AVX512-NEXT:    movq $-1, %rcx
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm8
; AVX512-NEXT:    vmovq %xmm6, %rdx
; AVX512-NEXT:    vmovq %xmm7, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm6
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm6[0],xmm8[0]
; AVX512-NEXT:    vextracti32x4 $2, %zmm2, %xmm7
; AVX512-NEXT:    vpextrq $1, %xmm7, %rdx
; AVX512-NEXT:    vextracti32x4 $2, %zmm0, %xmm6
; AVX512-NEXT:    vpextrq $1, %xmm6, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm9
; AVX512-NEXT:    vmovq %xmm7, %rdx
; AVX512-NEXT:    vmovq %xmm6, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm6
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm6, %ymm8
; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm7
; AVX512-NEXT:    vpextrq $1, %xmm7, %rdx
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm6
; AVX512-NEXT:    vpextrq $1, %xmm6, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm9
; AVX512-NEXT:    vmovq %xmm7, %rdx
; AVX512-NEXT:    vmovq %xmm6, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm6
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm7
; AVX512-NEXT:    vmovq %xmm2, %rdx
; AVX512-NEXT:    vmovq %xmm0, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm8
; AVX512-NEXT:    vextracti32x4 $3, %zmm3, %xmm2
; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
; AVX512-NEXT:    vextracti32x4 $3, %zmm1, %xmm6
; AVX512-NEXT:    vpextrq $1, %xmm6, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm7
; AVX512-NEXT:    vmovq %xmm2, %rdx
; AVX512-NEXT:    vmovq %xmm6, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm2
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
; AVX512-NEXT:    vextracti32x4 $2, %zmm3, %xmm6
; AVX512-NEXT:    vpextrq $1, %xmm6, %rdx
; AVX512-NEXT:    vextracti32x4 $2, %zmm1, %xmm7
; AVX512-NEXT:    vpextrq $1, %xmm7, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm0
; AVX512-NEXT:    vmovq %xmm6, %rdx
; AVX512-NEXT:    vmovq %xmm7, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm6
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm0
; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm6
; AVX512-NEXT:    vpextrq $1, %xmm6, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm7
; AVX512-NEXT:    vmovq %xmm0, %rdx
; AVX512-NEXT:    vmovq %xmm6, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; AVX512-NEXT:    vpextrq $1, %xmm3, %rdx
; AVX512-NEXT:    vpextrq $1, %xmm1, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmoveq %rcx, %rdx
; AVX512-NEXT:    vmovq %rdx, %xmm6
; AVX512-NEXT:    vmovq %xmm3, %rdx
; AVX512-NEXT:    vmovq %xmm1, %rsi
; AVX512-NEXT:    cmpq %rdx, %rsi
; AVX512-NEXT:    cmoveq %rcx, %rax
; AVX512-NEXT:    vmovq %rax, %xmm1
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpand %xmm4, %xmm0, %xmm1
; AVX512-NEXT:    vpandn %xmm5, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = icmp eq <16 x i64> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

;
; PACKSS case
;

; Same select pattern but the mask is built with the PACKSSWB intrinsic
; directly on the two <8 x i16> halves of the sext'd compare result.
define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE-LABEL: vselect_packss:
; SSE:       # BB#0:
; SSE-NEXT:    pcmpeqw %xmm3, %xmm1
; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE-NEXT:    packsswb %xmm1, %xmm0
; SSE-NEXT:    pand %xmm0, %xmm4
; SSE-NEXT:    pandn %xmm5, %xmm0
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vselect_packss:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_packss:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm1
; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vselect_packss:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm1
; AVX512-NEXT:    vpandn %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = icmp eq <16 x i16> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i16>
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %3, <8 x i16> %4)
  %6 = and <16 x i8> %5, %a2
  %7 = xor <16 x i8> %5, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %8 = and <16 x i8> %7, %a3
  %9 = or <16 x i8> %6, %8
  ret <16 x i8> %9
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)