blob: 0e7f9139919ac3a5cc8eb66f52c2d54d9827fff9 [file] [log] [blame]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
12
13;
14; PACKUS saturation truncation to vXi32 (signed clamp of each i64 lane to [0, 4294967295], then trunc)
15;
16
; Test case trunc_packus_v4i64_v4i32
;
; Clamps every signed i64 lane of %a0 to the range [0, 4294967295] and then
; truncates the clamped vector to <4 x i32>. The clamp is written as two
; icmp/select pairs (signed "less than" against 4294967295, then signed
; "greater than" against zero) at the bottom of the function.
;
; The per-target blocks that follow are the expected llc output for each RUN
; configuration. They are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script instead of editing them by hand.
17define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
18; SSE2-LABEL: trunc_packus_v4i64_v4i32:
19; SSE2: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +000020; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
Simon Pilgrim65ec9232018-01-26 14:58:50 +000021; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
22; SSE2-NEXT: movdqa %xmm0, %xmm3
23; SSE2-NEXT: pxor %xmm2, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +000024; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
25; SSE2-NEXT: movdqa %xmm5, %xmm6
26; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +000027; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
Simon Pilgrim0be55672018-02-11 10:52:37 +000028; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
29; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
30; SSE2-NEXT: pand %xmm7, %xmm4
31; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
32; SSE2-NEXT: por %xmm4, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +000033; SSE2-NEXT: pand %xmm3, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +000034; SSE2-NEXT: pandn %xmm8, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +000035; SSE2-NEXT: por %xmm0, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +000036; SSE2-NEXT: movdqa %xmm1, %xmm0
37; SSE2-NEXT: pxor %xmm2, %xmm0
38; SSE2-NEXT: movdqa %xmm5, %xmm4
39; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
40; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
41; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
42; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
43; SSE2-NEXT: pand %xmm6, %xmm0
44; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
45; SSE2-NEXT: por %xmm0, %xmm4
46; SSE2-NEXT: pand %xmm4, %xmm1
47; SSE2-NEXT: pandn %xmm8, %xmm4
48; SSE2-NEXT: por %xmm1, %xmm4
49; SSE2-NEXT: movdqa %xmm4, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +000050; SSE2-NEXT: pxor %xmm2, %xmm0
51; SSE2-NEXT: movdqa %xmm0, %xmm1
52; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
53; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
54; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +000055; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
56; SSE2-NEXT: pand %xmm5, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +000057; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +000058; SSE2-NEXT: por %xmm0, %xmm1
59; SSE2-NEXT: pand %xmm4, %xmm1
60; SSE2-NEXT: movdqa %xmm3, %xmm0
61; SSE2-NEXT: pxor %xmm2, %xmm0
62; SSE2-NEXT: movdqa %xmm0, %xmm4
63; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
64; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
65; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
66; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
67; SSE2-NEXT: pand %xmm5, %xmm2
68; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
69; SSE2-NEXT: por %xmm2, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +000070; SSE2-NEXT: pand %xmm3, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +000071; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
Simon Pilgrim65ec9232018-01-26 14:58:50 +000072; SSE2-NEXT: retq
73;
74; SSSE3-LABEL: trunc_packus_v4i64_v4i32:
75; SSSE3: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +000076; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
Simon Pilgrim65ec9232018-01-26 14:58:50 +000077; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
78; SSSE3-NEXT: movdqa %xmm0, %xmm3
79; SSSE3-NEXT: pxor %xmm2, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +000080; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
81; SSSE3-NEXT: movdqa %xmm5, %xmm6
82; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +000083; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
Simon Pilgrim0be55672018-02-11 10:52:37 +000084; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
85; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
86; SSSE3-NEXT: pand %xmm7, %xmm4
87; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
88; SSSE3-NEXT: por %xmm4, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +000089; SSSE3-NEXT: pand %xmm3, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +000090; SSSE3-NEXT: pandn %xmm8, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +000091; SSSE3-NEXT: por %xmm0, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +000092; SSSE3-NEXT: movdqa %xmm1, %xmm0
93; SSSE3-NEXT: pxor %xmm2, %xmm0
94; SSSE3-NEXT: movdqa %xmm5, %xmm4
95; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
96; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
97; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
98; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
99; SSSE3-NEXT: pand %xmm6, %xmm0
100; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
101; SSSE3-NEXT: por %xmm0, %xmm4
102; SSSE3-NEXT: pand %xmm4, %xmm1
103; SSSE3-NEXT: pandn %xmm8, %xmm4
104; SSSE3-NEXT: por %xmm1, %xmm4
105; SSSE3-NEXT: movdqa %xmm4, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000106; SSSE3-NEXT: pxor %xmm2, %xmm0
107; SSSE3-NEXT: movdqa %xmm0, %xmm1
108; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
109; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
110; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +0000111; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
112; SSSE3-NEXT: pand %xmm5, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000113; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000114; SSSE3-NEXT: por %xmm0, %xmm1
115; SSSE3-NEXT: pand %xmm4, %xmm1
116; SSSE3-NEXT: movdqa %xmm3, %xmm0
117; SSSE3-NEXT: pxor %xmm2, %xmm0
118; SSSE3-NEXT: movdqa %xmm0, %xmm4
119; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
120; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
121; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
122; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
123; SSSE3-NEXT: pand %xmm5, %xmm2
124; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
125; SSSE3-NEXT: por %xmm2, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000126; SSSE3-NEXT: pand %xmm3, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +0000127; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000128; SSSE3-NEXT: retq
129;
130; SSE41-LABEL: trunc_packus_v4i64_v4i32:
131; SSE41: # %bb.0:
132; SSE41-NEXT: movdqa %xmm0, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +0000133; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
134; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
135; SSE41-NEXT: pxor %xmm8, %xmm0
136; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647]
137; SSE41-NEXT: movdqa %xmm6, %xmm5
138; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
139; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
140; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000141; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000142; SSE41-NEXT: pand %xmm7, %xmm3
143; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000144; SSE41-NEXT: por %xmm3, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +0000145; SSE41-NEXT: movapd %xmm4, %xmm5
146; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
147; SSE41-NEXT: movdqa %xmm1, %xmm0
148; SSE41-NEXT: pxor %xmm8, %xmm0
149; SSE41-NEXT: movdqa %xmm6, %xmm2
150; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000151; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000152; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
153; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
154; SSE41-NEXT: pand %xmm3, %xmm6
155; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
156; SSE41-NEXT: por %xmm6, %xmm0
157; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
158; SSE41-NEXT: xorpd %xmm1, %xmm1
159; SSE41-NEXT: movapd %xmm4, %xmm0
160; SSE41-NEXT: xorpd %xmm8, %xmm0
161; SSE41-NEXT: movapd %xmm0, %xmm2
162; SSE41-NEXT: pcmpgtd %xmm8, %xmm2
163; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
164; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
165; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
166; SSE41-NEXT: pand %xmm3, %xmm6
167; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
168; SSE41-NEXT: por %xmm6, %xmm0
169; SSE41-NEXT: pxor %xmm2, %xmm2
170; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
171; SSE41-NEXT: movapd %xmm5, %xmm0
172; SSE41-NEXT: xorpd %xmm8, %xmm0
173; SSE41-NEXT: movapd %xmm0, %xmm3
174; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
175; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
176; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
177; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
178; SSE41-NEXT: pand %xmm4, %xmm6
179; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
180; SSE41-NEXT: por %xmm6, %xmm0
181; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
182; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
183; SSE41-NEXT: movaps %xmm1, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000184; SSE41-NEXT: retq
185;
186; AVX1-LABEL: trunc_packus_v4i64_v4i32:
187; AVX1: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +0000188; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
189; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
190; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295]
191; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
192; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
193; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
194; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000195; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
196; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
197; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
198; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
199; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
200; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
201; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
202; AVX1-NEXT: vzeroupper
203; AVX1-NEXT: retq
204;
205; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32:
206; AVX2-SLOW: # %bb.0:
207; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
208; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
209; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
210; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
211; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
212; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
213; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
214; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
Puyan Lotfi43e94b12018-01-31 22:04:26 +0000215; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000216; AVX2-SLOW-NEXT: vzeroupper
217; AVX2-SLOW-NEXT: retq
218;
219; AVX2-FAST-LABEL: trunc_packus_v4i64_v4i32:
220; AVX2-FAST: # %bb.0:
221; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
222; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
223; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
224; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
225; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
226; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0
227; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
228; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
Puyan Lotfi43e94b12018-01-31 22:04:26 +0000229; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000230; AVX2-FAST-NEXT: vzeroupper
231; AVX2-FAST-NEXT: retq
232;
233; AVX512F-LABEL: trunc_packus_v4i64_v4i32:
234; AVX512F: # %bb.0:
Puyan Lotfi43e94b12018-01-31 22:04:26 +0000235; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000236; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
237; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
238; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
239; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
240; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
Puyan Lotfi43e94b12018-01-31 22:04:26 +0000241; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000242; AVX512F-NEXT: vzeroupper
243; AVX512F-NEXT: retq
244;
245; AVX512VL-LABEL: trunc_packus_v4i64_v4i32:
246; AVX512VL: # %bb.0:
247; AVX512VL-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
248; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
249; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
250; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
251; AVX512VL-NEXT: vzeroupper
252; AVX512VL-NEXT: retq
253;
254; AVX512BW-LABEL: trunc_packus_v4i64_v4i32:
255; AVX512BW: # %bb.0:
Puyan Lotfi43e94b12018-01-31 22:04:26 +0000256; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000257; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
258; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
259; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
260; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
261; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
Puyan Lotfi43e94b12018-01-31 22:04:26 +0000262; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000263; AVX512BW-NEXT: vzeroupper
264; AVX512BW-NEXT: retq
265;
266; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32:
267; AVX512BWVL: # %bb.0:
268; AVX512BWVL-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
269; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
270; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
271; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
272; AVX512BWVL-NEXT: vzeroupper
273; AVX512BWVL-NEXT: retq
; Upper clamp. %2 is the signed minimum of %a0 and 4294967295 per lane
; (4294967295 is 2^32 - 1, the largest value that fits in 32 unsigned bits).
274 %1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
275 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; Lower clamp. %4 is the signed maximum of %2 and zero per lane, so negative
; inputs become 0.
276 %3 = icmp sgt <4 x i64> %2, zeroinitializer
277 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
; Every lane now lies in [0, 4294967295], so keeping only the low 32 bits
; loses no information.
278 %5 = trunc <4 x i64> %4 to <4 x i32>
279 ret <4 x i32> %5
280}
281
282
283define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
284; SSE2-LABEL: trunc_packus_v8i64_v8i32:
285; SSE2: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +0000286; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
287; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
288; SSE2-NEXT: movdqa %xmm0, %xmm5
289; SSE2-NEXT: pxor %xmm10, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000290; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
291; SSE2-NEXT: movdqa %xmm9, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000292; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
293; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
294; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +0000295; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
296; SSE2-NEXT: pand %xmm7, %xmm4
297; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000298; SSE2-NEXT: por %xmm4, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +0000299; SSE2-NEXT: pand %xmm5, %xmm0
300; SSE2-NEXT: pandn %xmm8, %xmm5
301; SSE2-NEXT: por %xmm0, %xmm5
302; SSE2-NEXT: movdqa %xmm1, %xmm0
303; SSE2-NEXT: pxor %xmm10, %xmm0
304; SSE2-NEXT: movdqa %xmm9, %xmm4
305; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
306; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
307; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
308; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
309; SSE2-NEXT: pand %xmm6, %xmm7
310; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
311; SSE2-NEXT: por %xmm7, %xmm0
312; SSE2-NEXT: pand %xmm0, %xmm1
313; SSE2-NEXT: pandn %xmm8, %xmm0
314; SSE2-NEXT: por %xmm1, %xmm0
315; SSE2-NEXT: movdqa %xmm2, %xmm1
316; SSE2-NEXT: pxor %xmm10, %xmm1
317; SSE2-NEXT: movdqa %xmm9, %xmm4
318; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
319; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
320; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000321; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000322; SSE2-NEXT: pand %xmm6, %xmm1
323; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
324; SSE2-NEXT: por %xmm1, %xmm6
325; SSE2-NEXT: pand %xmm6, %xmm2
326; SSE2-NEXT: pandn %xmm8, %xmm6
327; SSE2-NEXT: por %xmm2, %xmm6
328; SSE2-NEXT: movdqa %xmm3, %xmm1
329; SSE2-NEXT: pxor %xmm10, %xmm1
330; SSE2-NEXT: movdqa %xmm9, %xmm2
331; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
332; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
333; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
334; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
335; SSE2-NEXT: pand %xmm4, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000336; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
337; SSE2-NEXT: por %xmm1, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +0000338; SSE2-NEXT: pand %xmm2, %xmm3
339; SSE2-NEXT: pandn %xmm8, %xmm2
340; SSE2-NEXT: por %xmm3, %xmm2
341; SSE2-NEXT: movdqa %xmm2, %xmm1
342; SSE2-NEXT: pxor %xmm10, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000343; SSE2-NEXT: movdqa %xmm1, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +0000344; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
345; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
346; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
347; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
348; SSE2-NEXT: pand %xmm4, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000349; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000350; SSE2-NEXT: por %xmm1, %xmm3
351; SSE2-NEXT: pand %xmm2, %xmm3
352; SSE2-NEXT: movdqa %xmm6, %xmm1
353; SSE2-NEXT: pxor %xmm10, %xmm1
354; SSE2-NEXT: movdqa %xmm1, %xmm2
355; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
356; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
357; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
358; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
359; SSE2-NEXT: pand %xmm4, %xmm7
360; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
361; SSE2-NEXT: por %xmm7, %xmm1
362; SSE2-NEXT: pand %xmm6, %xmm1
363; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
364; SSE2-NEXT: movdqa %xmm0, %xmm2
365; SSE2-NEXT: pxor %xmm10, %xmm2
366; SSE2-NEXT: movdqa %xmm2, %xmm3
367; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
368; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
369; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
370; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
371; SSE2-NEXT: pand %xmm4, %xmm2
372; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
373; SSE2-NEXT: por %xmm2, %xmm3
374; SSE2-NEXT: pand %xmm0, %xmm3
375; SSE2-NEXT: movdqa %xmm5, %xmm0
376; SSE2-NEXT: pxor %xmm10, %xmm0
377; SSE2-NEXT: movdqa %xmm0, %xmm2
378; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
379; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
380; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
381; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
382; SSE2-NEXT: pand %xmm4, %xmm6
383; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
384; SSE2-NEXT: por %xmm6, %xmm0
385; SSE2-NEXT: pand %xmm5, %xmm0
386; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000387; SSE2-NEXT: retq
388;
389; SSSE3-LABEL: trunc_packus_v8i64_v8i32:
390; SSSE3: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +0000391; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
392; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
393; SSSE3-NEXT: movdqa %xmm0, %xmm5
394; SSSE3-NEXT: pxor %xmm10, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000395; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
396; SSSE3-NEXT: movdqa %xmm9, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000397; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
398; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
399; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +0000400; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
401; SSSE3-NEXT: pand %xmm7, %xmm4
402; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000403; SSSE3-NEXT: por %xmm4, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +0000404; SSSE3-NEXT: pand %xmm5, %xmm0
405; SSSE3-NEXT: pandn %xmm8, %xmm5
406; SSSE3-NEXT: por %xmm0, %xmm5
407; SSSE3-NEXT: movdqa %xmm1, %xmm0
408; SSSE3-NEXT: pxor %xmm10, %xmm0
409; SSSE3-NEXT: movdqa %xmm9, %xmm4
410; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
411; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
412; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
413; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
414; SSSE3-NEXT: pand %xmm6, %xmm7
415; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
416; SSSE3-NEXT: por %xmm7, %xmm0
417; SSSE3-NEXT: pand %xmm0, %xmm1
418; SSSE3-NEXT: pandn %xmm8, %xmm0
419; SSSE3-NEXT: por %xmm1, %xmm0
420; SSSE3-NEXT: movdqa %xmm2, %xmm1
421; SSSE3-NEXT: pxor %xmm10, %xmm1
422; SSSE3-NEXT: movdqa %xmm9, %xmm4
423; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
424; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
425; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000426; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000427; SSSE3-NEXT: pand %xmm6, %xmm1
428; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
429; SSSE3-NEXT: por %xmm1, %xmm6
430; SSSE3-NEXT: pand %xmm6, %xmm2
431; SSSE3-NEXT: pandn %xmm8, %xmm6
432; SSSE3-NEXT: por %xmm2, %xmm6
433; SSSE3-NEXT: movdqa %xmm3, %xmm1
434; SSSE3-NEXT: pxor %xmm10, %xmm1
435; SSSE3-NEXT: movdqa %xmm9, %xmm2
436; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
437; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
438; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
439; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
440; SSSE3-NEXT: pand %xmm4, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000441; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
442; SSSE3-NEXT: por %xmm1, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +0000443; SSSE3-NEXT: pand %xmm2, %xmm3
444; SSSE3-NEXT: pandn %xmm8, %xmm2
445; SSSE3-NEXT: por %xmm3, %xmm2
446; SSSE3-NEXT: movdqa %xmm2, %xmm1
447; SSSE3-NEXT: pxor %xmm10, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000448; SSSE3-NEXT: movdqa %xmm1, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +0000449; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
450; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
451; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
452; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
453; SSSE3-NEXT: pand %xmm4, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000454; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000455; SSSE3-NEXT: por %xmm1, %xmm3
456; SSSE3-NEXT: pand %xmm2, %xmm3
457; SSSE3-NEXT: movdqa %xmm6, %xmm1
458; SSSE3-NEXT: pxor %xmm10, %xmm1
459; SSSE3-NEXT: movdqa %xmm1, %xmm2
460; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
461; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
462; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
463; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
464; SSSE3-NEXT: pand %xmm4, %xmm7
465; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
466; SSSE3-NEXT: por %xmm7, %xmm1
467; SSSE3-NEXT: pand %xmm6, %xmm1
468; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
469; SSSE3-NEXT: movdqa %xmm0, %xmm2
470; SSSE3-NEXT: pxor %xmm10, %xmm2
471; SSSE3-NEXT: movdqa %xmm2, %xmm3
472; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
473; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
474; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
475; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
476; SSSE3-NEXT: pand %xmm4, %xmm2
477; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
478; SSSE3-NEXT: por %xmm2, %xmm3
479; SSSE3-NEXT: pand %xmm0, %xmm3
480; SSSE3-NEXT: movdqa %xmm5, %xmm0
481; SSSE3-NEXT: pxor %xmm10, %xmm0
482; SSSE3-NEXT: movdqa %xmm0, %xmm2
483; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
484; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
485; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
486; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
487; SSSE3-NEXT: pand %xmm4, %xmm6
488; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
489; SSSE3-NEXT: por %xmm6, %xmm0
490; SSSE3-NEXT: pand %xmm5, %xmm0
491; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000492; SSSE3-NEXT: retq
493;
494; SSE41-LABEL: trunc_packus_v8i64_v8i32:
495; SSE41: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +0000496; SSE41-NEXT: movdqa %xmm0, %xmm4
497; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295]
498; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
499; SSE41-NEXT: pxor %xmm10, %xmm0
500; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147483647,2147483647]
501; SSE41-NEXT: movdqa %xmm11, %xmm6
502; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
503; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
504; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
505; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
506; SSE41-NEXT: pand %xmm8, %xmm5
507; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
508; SSE41-NEXT: por %xmm5, %xmm0
509; SSE41-NEXT: movapd %xmm7, %xmm8
510; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
511; SSE41-NEXT: movdqa %xmm1, %xmm0
512; SSE41-NEXT: pxor %xmm10, %xmm0
513; SSE41-NEXT: movdqa %xmm11, %xmm4
514; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
515; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000516; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
517; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000518; SSE41-NEXT: pand %xmm5, %xmm6
519; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
520; SSE41-NEXT: por %xmm6, %xmm0
521; SSE41-NEXT: movapd %xmm7, %xmm9
522; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9
523; SSE41-NEXT: movdqa %xmm2, %xmm0
524; SSE41-NEXT: pxor %xmm10, %xmm0
525; SSE41-NEXT: movdqa %xmm11, %xmm1
526; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
527; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
528; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
529; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
530; SSE41-NEXT: pand %xmm4, %xmm5
531; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
532; SSE41-NEXT: por %xmm5, %xmm0
533; SSE41-NEXT: movapd %xmm7, %xmm4
534; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
535; SSE41-NEXT: movdqa %xmm3, %xmm0
536; SSE41-NEXT: pxor %xmm10, %xmm0
537; SSE41-NEXT: movdqa %xmm11, %xmm1
538; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
539; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
540; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
541; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
542; SSE41-NEXT: pand %xmm2, %xmm5
543; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
544; SSE41-NEXT: por %xmm5, %xmm0
545; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
546; SSE41-NEXT: pxor %xmm2, %xmm2
547; SSE41-NEXT: movapd %xmm7, %xmm0
548; SSE41-NEXT: xorpd %xmm10, %xmm0
549; SSE41-NEXT: movapd %xmm0, %xmm1
550; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
551; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
552; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
553; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
554; SSE41-NEXT: pand %xmm3, %xmm5
555; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
556; SSE41-NEXT: por %xmm5, %xmm0
557; SSE41-NEXT: pxor %xmm3, %xmm3
558; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3
559; SSE41-NEXT: movapd %xmm4, %xmm0
560; SSE41-NEXT: xorpd %xmm10, %xmm0
561; SSE41-NEXT: movapd %xmm0, %xmm1
562; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
563; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
564; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
565; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
566; SSE41-NEXT: pand %xmm5, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000567; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
568; SSE41-NEXT: por %xmm6, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +0000569; SSE41-NEXT: pxor %xmm1, %xmm1
570; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
571; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
572; SSE41-NEXT: movapd %xmm9, %xmm0
573; SSE41-NEXT: xorpd %xmm10, %xmm0
574; SSE41-NEXT: movapd %xmm0, %xmm3
575; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
576; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
577; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
578; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
579; SSE41-NEXT: pand %xmm4, %xmm5
580; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
581; SSE41-NEXT: por %xmm5, %xmm0
582; SSE41-NEXT: pxor %xmm3, %xmm3
583; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3
584; SSE41-NEXT: movapd %xmm8, %xmm0
585; SSE41-NEXT: xorpd %xmm10, %xmm0
586; SSE41-NEXT: movapd %xmm0, %xmm4
587; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
588; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
589; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
590; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
591; SSE41-NEXT: pand %xmm5, %xmm6
592; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
593; SSE41-NEXT: por %xmm6, %xmm0
594; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
595; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
596; SSE41-NEXT: movaps %xmm2, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000597; SSE41-NEXT: retq
598;
599; AVX1-LABEL: trunc_packus_v8i64_v8i32:
600; AVX1: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +0000601; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
602; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
603; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4294967295,4294967295]
604; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
605; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
606; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
607; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
608; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
609; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
610; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4
611; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
612; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000613; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
614; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
615; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
616; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
617; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6
618; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
619; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2
620; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
621; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1
622; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
623; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm2
624; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
625; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
626; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
627; AVX1-NEXT: retq
628;
629; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
630; AVX2-SLOW: # %bb.0:
631; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
632; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000633; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
Simon Pilgrim0be55672018-02-11 10:52:37 +0000634; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
635; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000636; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +0000637; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
638; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1
639; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
640; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000641; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
642; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
643; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
644; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
645; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
646; AVX2-SLOW-NEXT: retq
647;
648; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32:
649; AVX2-FAST: # %bb.0:
650; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
651; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000652; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
Simon Pilgrim0be55672018-02-11 10:52:37 +0000653; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
654; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000655; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +0000656; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
657; AVX2-FAST-NEXT: vpand %ymm1, %ymm3, %ymm1
658; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
659; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000660; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
661; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
662; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
663; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
664; AVX2-FAST-NEXT: retq
665;
666; AVX512-LABEL: trunc_packus_v8i64_v8i32:
667; AVX512: # %bb.0:
668; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
669; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
670; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
671; AVX512-NEXT: vpmovqd %zmm0, %ymm0
672; AVX512-NEXT: retq
673 %1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
674 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
675 %3 = icmp sgt <8 x i64> %2, zeroinitializer
676 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
677 %5 = trunc <8 x i64> %4 to <8 x i32>
678 ret <8 x i32> %5
679}
680
681;
682; PACKUS saturation truncation to vXi16
683;
684
; Test case: clamp every i64 lane of %a0 to the range [0, 65535] using signed
; compares, then truncate each lane to i16. This is the "PACKUS" pattern named
; in the section banner above: a signed input saturated to the unsigned 16-bit
; range.
;
; The per-target expectations that follow are generated output (see the NOTE on
; the first line of this file); regenerate them with that script instead of
; editing them by hand. In the expectations visible here, the SSE4.1 and AVX1
; sequences end in packusdw, and the AVX-512 sequence is vpminsq/vpmaxsq
; followed by vpmovqw.
685define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
686; SSE2-LABEL: trunc_packus_v8i64_v8i16:
687; SSE2: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +0000688; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
689; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
690; SSE2-NEXT: movdqa %xmm1, %xmm5
691; SSE2-NEXT: pxor %xmm10, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000692; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
693; SSE2-NEXT: movdqa %xmm9, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000694; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
695; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
696; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +0000697; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
698; SSE2-NEXT: pand %xmm7, %xmm4
699; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
700; SSE2-NEXT: por %xmm4, %xmm5
701; SSE2-NEXT: pand %xmm5, %xmm1
702; SSE2-NEXT: pandn %xmm8, %xmm5
703; SSE2-NEXT: por %xmm1, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000704; SSE2-NEXT: movdqa %xmm0, %xmm1
Simon Pilgrim0be55672018-02-11 10:52:37 +0000705; SSE2-NEXT: pxor %xmm10, %xmm1
706; SSE2-NEXT: movdqa %xmm9, %xmm4
707; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000708; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000709; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
710; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
711; SSE2-NEXT: pand %xmm6, %xmm7
712; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
713; SSE2-NEXT: por %xmm7, %xmm1
714; SSE2-NEXT: pand %xmm1, %xmm0
715; SSE2-NEXT: pandn %xmm8, %xmm1
716; SSE2-NEXT: por %xmm0, %xmm1
717; SSE2-NEXT: movdqa %xmm3, %xmm0
718; SSE2-NEXT: pxor %xmm10, %xmm0
719; SSE2-NEXT: movdqa %xmm9, %xmm4
720; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
721; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
722; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
723; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
724; SSE2-NEXT: pand %xmm6, %xmm0
725; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
726; SSE2-NEXT: por %xmm0, %xmm6
727; SSE2-NEXT: pand %xmm6, %xmm3
728; SSE2-NEXT: pandn %xmm8, %xmm6
729; SSE2-NEXT: por %xmm3, %xmm6
730; SSE2-NEXT: movdqa %xmm2, %xmm0
731; SSE2-NEXT: pxor %xmm10, %xmm0
732; SSE2-NEXT: movdqa %xmm9, %xmm3
733; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
734; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
735; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
736; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
737; SSE2-NEXT: pand %xmm4, %xmm0
738; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
739; SSE2-NEXT: por %xmm0, %xmm3
740; SSE2-NEXT: pand %xmm3, %xmm2
741; SSE2-NEXT: pandn %xmm8, %xmm3
742; SSE2-NEXT: por %xmm2, %xmm3
743; SSE2-NEXT: movdqa %xmm3, %xmm0
744; SSE2-NEXT: pxor %xmm10, %xmm0
745; SSE2-NEXT: movdqa %xmm0, %xmm2
746; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
747; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
748; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
749; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
750; SSE2-NEXT: pand %xmm4, %xmm7
751; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
752; SSE2-NEXT: por %xmm7, %xmm0
753; SSE2-NEXT: pand %xmm3, %xmm0
754; SSE2-NEXT: movdqa %xmm6, %xmm2
755; SSE2-NEXT: pxor %xmm10, %xmm2
756; SSE2-NEXT: movdqa %xmm2, %xmm3
757; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
758; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
759; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
760; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
761; SSE2-NEXT: pand %xmm4, %xmm7
762; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
763; SSE2-NEXT: por %xmm7, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000764; SSE2-NEXT: pand %xmm6, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +0000765; SSE2-NEXT: movdqa %xmm1, %xmm3
766; SSE2-NEXT: pxor %xmm10, %xmm3
767; SSE2-NEXT: movdqa %xmm3, %xmm4
768; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
769; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
770; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
771; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
772; SSE2-NEXT: pand %xmm6, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000773; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000774; SSE2-NEXT: por %xmm3, %xmm4
775; SSE2-NEXT: pand %xmm1, %xmm4
776; SSE2-NEXT: movdqa %xmm5, %xmm1
777; SSE2-NEXT: pxor %xmm10, %xmm1
778; SSE2-NEXT: movdqa %xmm1, %xmm3
779; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
780; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
781; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
782; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
783; SSE2-NEXT: pand %xmm6, %xmm1
784; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
785; SSE2-NEXT: por %xmm1, %xmm3
786; SSE2-NEXT: pand %xmm5, %xmm3
787; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000788; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000789; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
790; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
791; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
792; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
793; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
794; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000795; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000796; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
797; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000798; SSE2-NEXT: retq
799;
800; SSSE3-LABEL: trunc_packus_v8i64_v8i16:
801; SSSE3: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +0000802; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
803; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
804; SSSE3-NEXT: movdqa %xmm1, %xmm5
805; SSSE3-NEXT: pxor %xmm10, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000806; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
807; SSSE3-NEXT: movdqa %xmm9, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000808; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
809; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
810; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +0000811; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
812; SSSE3-NEXT: pand %xmm7, %xmm4
813; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
814; SSSE3-NEXT: por %xmm4, %xmm5
815; SSSE3-NEXT: pand %xmm5, %xmm1
816; SSSE3-NEXT: pandn %xmm8, %xmm5
817; SSSE3-NEXT: por %xmm1, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000818; SSSE3-NEXT: movdqa %xmm0, %xmm1
Simon Pilgrim0be55672018-02-11 10:52:37 +0000819; SSSE3-NEXT: pxor %xmm10, %xmm1
820; SSSE3-NEXT: movdqa %xmm9, %xmm4
821; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000822; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000823; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
824; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
825; SSSE3-NEXT: pand %xmm6, %xmm7
826; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
827; SSSE3-NEXT: por %xmm7, %xmm1
828; SSSE3-NEXT: pand %xmm1, %xmm0
829; SSSE3-NEXT: pandn %xmm8, %xmm1
830; SSSE3-NEXT: por %xmm0, %xmm1
831; SSSE3-NEXT: movdqa %xmm3, %xmm0
832; SSSE3-NEXT: pxor %xmm10, %xmm0
833; SSSE3-NEXT: movdqa %xmm9, %xmm4
834; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
835; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
836; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
837; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
838; SSSE3-NEXT: pand %xmm6, %xmm0
839; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
840; SSSE3-NEXT: por %xmm0, %xmm6
841; SSSE3-NEXT: pand %xmm6, %xmm3
842; SSSE3-NEXT: pandn %xmm8, %xmm6
843; SSSE3-NEXT: por %xmm3, %xmm6
844; SSSE3-NEXT: movdqa %xmm2, %xmm0
845; SSSE3-NEXT: pxor %xmm10, %xmm0
846; SSSE3-NEXT: movdqa %xmm9, %xmm3
847; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
848; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
849; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
850; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
851; SSSE3-NEXT: pand %xmm4, %xmm0
852; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
853; SSSE3-NEXT: por %xmm0, %xmm3
854; SSSE3-NEXT: pand %xmm3, %xmm2
855; SSSE3-NEXT: pandn %xmm8, %xmm3
856; SSSE3-NEXT: por %xmm2, %xmm3
857; SSSE3-NEXT: movdqa %xmm3, %xmm0
858; SSSE3-NEXT: pxor %xmm10, %xmm0
859; SSSE3-NEXT: movdqa %xmm0, %xmm2
860; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
861; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
862; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
863; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
864; SSSE3-NEXT: pand %xmm4, %xmm7
865; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
866; SSSE3-NEXT: por %xmm7, %xmm0
867; SSSE3-NEXT: pand %xmm3, %xmm0
868; SSSE3-NEXT: movdqa %xmm6, %xmm2
869; SSSE3-NEXT: pxor %xmm10, %xmm2
870; SSSE3-NEXT: movdqa %xmm2, %xmm3
871; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
872; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
873; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
874; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
875; SSSE3-NEXT: pand %xmm4, %xmm7
876; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
877; SSSE3-NEXT: por %xmm7, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000878; SSSE3-NEXT: pand %xmm6, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +0000879; SSSE3-NEXT: movdqa %xmm1, %xmm3
880; SSSE3-NEXT: pxor %xmm10, %xmm3
881; SSSE3-NEXT: movdqa %xmm3, %xmm4
882; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
883; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
884; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
885; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
886; SSSE3-NEXT: pand %xmm6, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000887; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000888; SSSE3-NEXT: por %xmm3, %xmm4
889; SSSE3-NEXT: pand %xmm1, %xmm4
890; SSSE3-NEXT: movdqa %xmm5, %xmm1
891; SSSE3-NEXT: pxor %xmm10, %xmm1
892; SSSE3-NEXT: movdqa %xmm1, %xmm3
893; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
894; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
895; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
896; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
897; SSSE3-NEXT: pand %xmm6, %xmm1
898; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
899; SSSE3-NEXT: por %xmm1, %xmm3
900; SSSE3-NEXT: pand %xmm5, %xmm3
901; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000902; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000903; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
904; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
905; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
906; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
907; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
908; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000909; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
Simon Pilgrim0be55672018-02-11 10:52:37 +0000910; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
911; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000912; SSSE3-NEXT: retq
913;
914; SSE41-LABEL: trunc_packus_v8i64_v8i16:
915; SSE41: # %bb.0:
916; SSE41-NEXT: movdqa %xmm0, %xmm8
Simon Pilgrim0be55672018-02-11 10:52:37 +0000917; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
918; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
919; SSE41-NEXT: movdqa %xmm3, %xmm0
920; SSE41-NEXT: pxor %xmm9, %xmm0
921; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183]
922; SSE41-NEXT: movdqa %xmm10, %xmm6
923; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
924; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
925; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
926; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
927; SSE41-NEXT: pand %xmm5, %xmm4
928; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
929; SSE41-NEXT: por %xmm4, %xmm0
930; SSE41-NEXT: movapd %xmm7, %xmm6
931; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
932; SSE41-NEXT: movdqa %xmm2, %xmm0
933; SSE41-NEXT: pxor %xmm9, %xmm0
934; SSE41-NEXT: movdqa %xmm10, %xmm3
935; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
936; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
937; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
938; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000939; SSE41-NEXT: pand %xmm4, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +0000940; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
941; SSE41-NEXT: por %xmm5, %xmm0
942; SSE41-NEXT: movapd %xmm7, %xmm3
943; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
944; SSE41-NEXT: movdqa %xmm1, %xmm0
945; SSE41-NEXT: pxor %xmm9, %xmm0
946; SSE41-NEXT: movdqa %xmm10, %xmm2
947; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
948; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
949; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
950; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
951; SSE41-NEXT: pand %xmm4, %xmm5
952; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
953; SSE41-NEXT: por %xmm5, %xmm0
954; SSE41-NEXT: movapd %xmm7, %xmm2
955; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
956; SSE41-NEXT: movdqa %xmm8, %xmm0
957; SSE41-NEXT: pxor %xmm9, %xmm0
958; SSE41-NEXT: movdqa %xmm10, %xmm1
959; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
960; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
961; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
962; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
963; SSE41-NEXT: pand %xmm4, %xmm5
964; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
965; SSE41-NEXT: por %xmm5, %xmm0
966; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
967; SSE41-NEXT: movapd %xmm7, %xmm0
968; SSE41-NEXT: xorpd %xmm9, %xmm0
969; SSE41-NEXT: movapd %xmm0, %xmm1
970; SSE41-NEXT: pcmpgtd %xmm9, %xmm1
971; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
972; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
973; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
974; SSE41-NEXT: pand %xmm4, %xmm5
975; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
976; SSE41-NEXT: por %xmm5, %xmm0
977; SSE41-NEXT: pxor %xmm8, %xmm8
978; SSE41-NEXT: pxor %xmm1, %xmm1
979; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
980; SSE41-NEXT: movapd %xmm2, %xmm0
981; SSE41-NEXT: xorpd %xmm9, %xmm0
982; SSE41-NEXT: movapd %xmm0, %xmm5
983; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
984; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
985; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
986; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
987; SSE41-NEXT: pand %xmm7, %xmm4
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000988; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
989; SSE41-NEXT: por %xmm4, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +0000990; SSE41-NEXT: pxor %xmm7, %xmm7
991; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm7
992; SSE41-NEXT: movapd %xmm3, %xmm0
993; SSE41-NEXT: xorpd %xmm9, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +0000994; SSE41-NEXT: movapd %xmm0, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +0000995; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
996; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
997; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
998; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
999; SSE41-NEXT: pand %xmm4, %xmm5
1000; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
1001; SSE41-NEXT: por %xmm5, %xmm0
1002; SSE41-NEXT: pxor %xmm2, %xmm2
1003; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
1004; SSE41-NEXT: movapd %xmm6, %xmm0
1005; SSE41-NEXT: xorpd %xmm9, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001006; SSE41-NEXT: movapd %xmm0, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00001007; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001008; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001009; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1010; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
1011; SSE41-NEXT: pand %xmm4, %xmm5
1012; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
1013; SSE41-NEXT: por %xmm5, %xmm0
1014; SSE41-NEXT: pxor %xmm3, %xmm3
1015; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
1016; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2,3],xmm3[4],xmm8[5,6,7]
1017; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2,3],xmm2[4],xmm8[5,6,7]
1018; SSE41-NEXT: packusdw %xmm3, %xmm2
1019; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1,2,3],xmm7[4],xmm8[5,6,7]
1020; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1,2,3],xmm1[4],xmm8[5,6,7]
1021; SSE41-NEXT: packusdw %xmm7, %xmm1
1022; SSE41-NEXT: packusdw %xmm2, %xmm1
1023; SSE41-NEXT: movdqa %xmm1, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001024; SSE41-NEXT: retq
1025;
1026; AVX1-LABEL: trunc_packus_v8i64_v8i16:
1027; AVX1: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00001028; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [65535,65535,65535,65535]
1029; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1030; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535]
1031; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
1032; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
1033; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
1034; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
1035; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1036; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
1037; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4
1038; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
1039; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001040; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
1041; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm8
1042; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
1043; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
1044; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6
1045; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
1046; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm3
1047; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1048; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
1049; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1
1050; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
1051; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1052; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
1053; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
1054; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0
1055; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1056; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1057; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1058; AVX1-NEXT: vzeroupper
1059; AVX1-NEXT: retq
1060;
1061; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i16:
1062; AVX2-SLOW: # %bb.0:
1063; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535]
1064; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001065; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001066; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
1067; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001068; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00001069; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
1070; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1
1071; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
1072; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001073; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1074; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1075; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1076; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1077; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1078; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1079; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
Puyan Lotfi43e94b12018-01-31 22:04:26 +00001080; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001081; AVX2-SLOW-NEXT: vzeroupper
1082; AVX2-SLOW-NEXT: retq
1083;
1084; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i16:
1085; AVX2-FAST: # %bb.0:
1086; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535]
1087; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001088; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001089; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
1090; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001091; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00001092; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
1093; AVX2-FAST-NEXT: vpand %ymm1, %ymm3, %ymm1
1094; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
1095; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001096; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1097; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1098; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1099; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1100; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1101; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
Puyan Lotfi43e94b12018-01-31 22:04:26 +00001102; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001103; AVX2-FAST-NEXT: vzeroupper
1104; AVX2-FAST-NEXT: retq
1105;
1106; AVX512-LABEL: trunc_packus_v8i64_v8i16:
1107; AVX512: # %bb.0:
1108; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
1109; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1110; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
1111; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1112; AVX512-NEXT: vzeroupper
1113; AVX512-NEXT: retq
; Step 1 (upper clamp): lane-wise signed min(%a0, 65535), written as
; icmp slt + select.
1114 %1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
1115 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
; Step 2 (lower clamp): lane-wise signed max(%2, 0), written as
; icmp sgt + select; negative lanes become 0.
1116 %3 = icmp sgt <8 x i64> %2, zeroinitializer
1117 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
; Step 3: keep the low 16 bits of each lane. Every lane is already within
; [0, 65535], so no information is lost by this truncation.
1118 %5 = trunc <8 x i64> %4 to <8 x i16>
1119 ret <8 x i16> %5
1120}
1121
; Test case: clamp every i32 lane of %a0 to the range [0, 65535] using signed
; compares, then truncate each lane to i16 (signed input saturated to the
; unsigned 16-bit range).
;
; The per-target expectations that follow are generated output (see the NOTE on
; the first line of this file); regenerate them with that script instead of
; editing them by hand. In the expectations visible here, the SSE4.1, AVX1 and
; AVX2 sequences reduce the whole clamp+truncate to a single (v)packusdw
; (preceded by a 128-bit extract on the AVX targets), while all four AVX-512
; configurations expect vpminsd/vpmaxsd followed by vpmovdw.
1122define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
1123; SSE2-LABEL: trunc_packus_v8i32_v8i16:
1124; SSE2: # %bb.0:
1125; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
1126; SSE2-NEXT: movdqa %xmm2, %xmm3
1127; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001128; SSE2-NEXT: pand %xmm3, %xmm1
1129; SSE2-NEXT: pandn %xmm2, %xmm3
1130; SSE2-NEXT: por %xmm1, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00001131; SSE2-NEXT: movdqa %xmm2, %xmm1
1132; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
1133; SSE2-NEXT: pand %xmm1, %xmm0
1134; SSE2-NEXT: pandn %xmm2, %xmm1
1135; SSE2-NEXT: por %xmm0, %xmm1
1136; SSE2-NEXT: pxor %xmm2, %xmm2
1137; SSE2-NEXT: movdqa %xmm1, %xmm0
1138; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
1139; SSE2-NEXT: pand %xmm1, %xmm0
1140; SSE2-NEXT: movdqa %xmm3, %xmm1
1141; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
1142; SSE2-NEXT: pand %xmm3, %xmm1
1143; SSE2-NEXT: pslld $16, %xmm1
1144; SSE2-NEXT: psrad $16, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001145; SSE2-NEXT: pslld $16, %xmm0
1146; SSE2-NEXT: psrad $16, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001147; SSE2-NEXT: packssdw %xmm1, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001148; SSE2-NEXT: retq
1149;
1150; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
1151; SSSE3: # %bb.0:
1152; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
1153; SSSE3-NEXT: movdqa %xmm2, %xmm3
1154; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001155; SSSE3-NEXT: pand %xmm3, %xmm1
1156; SSSE3-NEXT: pandn %xmm2, %xmm3
1157; SSSE3-NEXT: por %xmm1, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00001158; SSSE3-NEXT: movdqa %xmm2, %xmm1
1159; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
1160; SSSE3-NEXT: pand %xmm1, %xmm0
1161; SSSE3-NEXT: pandn %xmm2, %xmm1
1162; SSSE3-NEXT: por %xmm0, %xmm1
1163; SSSE3-NEXT: pxor %xmm2, %xmm2
1164; SSSE3-NEXT: movdqa %xmm1, %xmm0
1165; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
1166; SSSE3-NEXT: pand %xmm1, %xmm0
1167; SSSE3-NEXT: movdqa %xmm3, %xmm1
1168; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
1169; SSSE3-NEXT: pand %xmm3, %xmm1
1170; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1171; SSSE3-NEXT: pshufb %xmm2, %xmm1
1172; SSSE3-NEXT: pshufb %xmm2, %xmm0
1173; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001174; SSSE3-NEXT: retq
1175;
1176; SSE41-LABEL: trunc_packus_v8i32_v8i16:
1177; SSE41: # %bb.0:
Simon Pilgrim86d15bf2018-02-14 14:14:29 +00001178; SSE41-NEXT: packusdw %xmm1, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001179; SSE41-NEXT: retq
1180;
1181; AVX1-LABEL: trunc_packus_v8i32_v8i16:
1182; AVX1: # %bb.0:
1183; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
Simon Pilgrimb4e789e2018-02-07 15:48:44 +00001184; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001185; AVX1-NEXT: vzeroupper
1186; AVX1-NEXT: retq
1187;
1188; AVX2-LABEL: trunc_packus_v8i32_v8i16:
1189; AVX2: # %bb.0:
Simon Pilgrimb4e789e2018-02-07 15:48:44 +00001190; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1191; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001192; AVX2-NEXT: vzeroupper
1193; AVX2-NEXT: retq
1194;
1195; AVX512F-LABEL: trunc_packus_v8i32_v8i16:
1196; AVX512F: # %bb.0:
1197; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
1198; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
1199; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
1200; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
1201; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
Puyan Lotfi43e94b12018-01-31 22:04:26 +00001202; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001203; AVX512F-NEXT: vzeroupper
1204; AVX512F-NEXT: retq
1205;
1206; AVX512VL-LABEL: trunc_packus_v8i32_v8i16:
1207; AVX512VL: # %bb.0:
1208; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
1209; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
1210; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
1211; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
1212; AVX512VL-NEXT: vzeroupper
1213; AVX512VL-NEXT: retq
1214;
1215; AVX512BW-LABEL: trunc_packus_v8i32_v8i16:
1216; AVX512BW: # %bb.0:
1217; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
1218; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
1219; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
1220; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
1221; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
Puyan Lotfi43e94b12018-01-31 22:04:26 +00001222; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001223; AVX512BW-NEXT: vzeroupper
1224; AVX512BW-NEXT: retq
1225;
1226; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16:
1227; AVX512BWVL: # %bb.0:
1228; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
1229; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
1230; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
1231; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
1232; AVX512BWVL-NEXT: vzeroupper
1233; AVX512BWVL-NEXT: retq
; Step 1 (upper clamp): lane-wise signed min(%a0, 65535), written as
; icmp slt + select.
1234 %1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1235 %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
; Step 2 (lower clamp): lane-wise signed max(%2, 0), written as
; icmp sgt + select; negative lanes become 0.
1236 %3 = icmp sgt <8 x i32> %2, zeroinitializer
1237 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
; Step 3: keep the low 16 bits of each lane. Every lane is already within
; [0, 65535], so no information is lost by this truncation.
1238 %5 = trunc <8 x i32> %4 to <8 x i16>
1239 ret <8 x i16> %5
1240}
1241
1242define <16 x i16> @trunc_packus_v16i32_v16i16(<16 x i32> %a0) {
1243; SSE2-LABEL: trunc_packus_v16i32_v16i16:
1244; SSE2: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00001245; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
1246; SSE2-NEXT: movdqa %xmm6, %xmm4
1247; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
1248; SSE2-NEXT: pand %xmm4, %xmm1
1249; SSE2-NEXT: pandn %xmm6, %xmm4
1250; SSE2-NEXT: por %xmm1, %xmm4
1251; SSE2-NEXT: movdqa %xmm6, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001252; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001253; SSE2-NEXT: pand %xmm5, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001254; SSE2-NEXT: pandn %xmm6, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001255; SSE2-NEXT: por %xmm0, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +00001256; SSE2-NEXT: movdqa %xmm6, %xmm0
1257; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
1258; SSE2-NEXT: pand %xmm0, %xmm3
1259; SSE2-NEXT: pandn %xmm6, %xmm0
1260; SSE2-NEXT: por %xmm3, %xmm0
1261; SSE2-NEXT: movdqa %xmm6, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001262; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00001263; SSE2-NEXT: pand %xmm3, %xmm2
1264; SSE2-NEXT: pandn %xmm6, %xmm3
1265; SSE2-NEXT: por %xmm2, %xmm3
1266; SSE2-NEXT: pxor %xmm2, %xmm2
1267; SSE2-NEXT: movdqa %xmm3, %xmm1
1268; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
1269; SSE2-NEXT: pand %xmm3, %xmm1
1270; SSE2-NEXT: movdqa %xmm0, %xmm3
1271; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
1272; SSE2-NEXT: pand %xmm0, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001273; SSE2-NEXT: movdqa %xmm5, %xmm0
1274; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001275; SSE2-NEXT: pand %xmm5, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001276; SSE2-NEXT: movdqa %xmm4, %xmm5
1277; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
1278; SSE2-NEXT: pand %xmm4, %xmm5
1279; SSE2-NEXT: pslld $16, %xmm5
1280; SSE2-NEXT: psrad $16, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001281; SSE2-NEXT: pslld $16, %xmm0
1282; SSE2-NEXT: psrad $16, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001283; SSE2-NEXT: packssdw %xmm5, %xmm0
1284; SSE2-NEXT: pslld $16, %xmm3
1285; SSE2-NEXT: psrad $16, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001286; SSE2-NEXT: pslld $16, %xmm1
1287; SSE2-NEXT: psrad $16, %xmm1
Simon Pilgrim0be55672018-02-11 10:52:37 +00001288; SSE2-NEXT: packssdw %xmm3, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001289; SSE2-NEXT: retq
1290;
1291; SSSE3-LABEL: trunc_packus_v16i32_v16i16:
1292; SSSE3: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00001293; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
1294; SSSE3-NEXT: movdqa %xmm6, %xmm4
1295; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
1296; SSSE3-NEXT: pand %xmm4, %xmm1
1297; SSSE3-NEXT: pandn %xmm6, %xmm4
1298; SSSE3-NEXT: por %xmm1, %xmm4
1299; SSSE3-NEXT: movdqa %xmm6, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001300; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001301; SSSE3-NEXT: pand %xmm5, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001302; SSSE3-NEXT: pandn %xmm6, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001303; SSSE3-NEXT: por %xmm0, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +00001304; SSSE3-NEXT: movdqa %xmm6, %xmm0
1305; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
1306; SSSE3-NEXT: pand %xmm0, %xmm3
1307; SSSE3-NEXT: pandn %xmm6, %xmm0
1308; SSSE3-NEXT: por %xmm3, %xmm0
1309; SSSE3-NEXT: movdqa %xmm6, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001310; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00001311; SSSE3-NEXT: pand %xmm3, %xmm2
1312; SSSE3-NEXT: pandn %xmm6, %xmm3
1313; SSSE3-NEXT: por %xmm2, %xmm3
1314; SSSE3-NEXT: pxor %xmm2, %xmm2
1315; SSSE3-NEXT: movdqa %xmm3, %xmm1
1316; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
1317; SSSE3-NEXT: pand %xmm3, %xmm1
1318; SSSE3-NEXT: movdqa %xmm0, %xmm3
1319; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
1320; SSSE3-NEXT: pand %xmm0, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001321; SSSE3-NEXT: movdqa %xmm5, %xmm0
1322; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001323; SSSE3-NEXT: pand %xmm5, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001324; SSSE3-NEXT: movdqa %xmm4, %xmm5
1325; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
1326; SSSE3-NEXT: pand %xmm4, %xmm5
1327; SSSE3-NEXT: pslld $16, %xmm5
1328; SSSE3-NEXT: psrad $16, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001329; SSSE3-NEXT: pslld $16, %xmm0
1330; SSSE3-NEXT: psrad $16, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001331; SSSE3-NEXT: packssdw %xmm5, %xmm0
1332; SSSE3-NEXT: pslld $16, %xmm3
1333; SSSE3-NEXT: psrad $16, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001334; SSSE3-NEXT: pslld $16, %xmm1
1335; SSSE3-NEXT: psrad $16, %xmm1
Simon Pilgrim0be55672018-02-11 10:52:37 +00001336; SSSE3-NEXT: packssdw %xmm3, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001337; SSSE3-NEXT: retq
1338;
1339; SSE41-LABEL: trunc_packus_v16i32_v16i16:
1340; SSE41: # %bb.0:
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001341; SSE41-NEXT: packusdw %xmm1, %xmm0
Simon Pilgrim86d15bf2018-02-14 14:14:29 +00001342; SSE41-NEXT: packusdw %xmm3, %xmm2
1343; SSE41-NEXT: movdqa %xmm2, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001344; SSE41-NEXT: retq
1345;
1346; AVX1-LABEL: trunc_packus_v16i32_v16i16:
1347; AVX1: # %bb.0:
1348; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001349; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
Simon Pilgrim86d15bf2018-02-14 14:14:29 +00001350; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001351; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1352; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1353; AVX1-NEXT: retq
1354;
1355; AVX2-LABEL: trunc_packus_v16i32_v16i16:
1356; AVX2: # %bb.0:
Simon Pilgrim86d15bf2018-02-14 14:14:29 +00001357; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1358; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001359; AVX2-NEXT: retq
1360;
1361; AVX512-LABEL: trunc_packus_v16i32_v16i16:
1362; AVX512: # %bb.0:
1363; AVX512-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
1364; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1365; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
1366; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1367; AVX512-NEXT: retq
1368 %1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1369 %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1370 %3 = icmp sgt <16 x i32> %2, zeroinitializer
1371 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1372 %5 = trunc <16 x i32> %4 to <16 x i16>
1373 ret <16 x i16> %5
1374}
1375
1376;
1377; PACKUS saturation truncation to vXi8
1378;
1379
; Test function trunc_packus_v8i64_v8i8.
; Clamps every signed i64 lane of %a0 to the range [0, 255] (signed min with
; 255, then signed max with 0) and truncates the result to <8 x i8>.
; The prefixed comment lines that follow are the expected llc output for each
; FileCheck prefix. The file header states they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them with that script instead
; of editing them by hand.
1380define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
1381; SSE2-LABEL: trunc_packus_v8i64_v8i8:
1382; SSE2: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00001383; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
1384; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
1385; SSE2-NEXT: movdqa %xmm1, %xmm5
1386; SSE2-NEXT: pxor %xmm10, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001387; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
1388; SSE2-NEXT: movdqa %xmm9, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001389; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
1390; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1391; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +00001392; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
1393; SSE2-NEXT: pand %xmm7, %xmm4
1394; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
1395; SSE2-NEXT: por %xmm4, %xmm5
1396; SSE2-NEXT: pand %xmm5, %xmm1
1397; SSE2-NEXT: pandn %xmm8, %xmm5
1398; SSE2-NEXT: por %xmm1, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001399; SSE2-NEXT: movdqa %xmm0, %xmm1
Simon Pilgrim0be55672018-02-11 10:52:37 +00001400; SSE2-NEXT: pxor %xmm10, %xmm1
1401; SSE2-NEXT: movdqa %xmm9, %xmm4
1402; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001403; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001404; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
1405; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
1406; SSE2-NEXT: pand %xmm6, %xmm7
1407; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
1408; SSE2-NEXT: por %xmm7, %xmm1
1409; SSE2-NEXT: pand %xmm1, %xmm0
1410; SSE2-NEXT: pandn %xmm8, %xmm1
1411; SSE2-NEXT: por %xmm0, %xmm1
1412; SSE2-NEXT: movdqa %xmm3, %xmm0
1413; SSE2-NEXT: pxor %xmm10, %xmm0
1414; SSE2-NEXT: movdqa %xmm9, %xmm4
1415; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
1416; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1417; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
1418; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1419; SSE2-NEXT: pand %xmm6, %xmm0
1420; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
1421; SSE2-NEXT: por %xmm0, %xmm6
1422; SSE2-NEXT: pand %xmm6, %xmm3
1423; SSE2-NEXT: pandn %xmm8, %xmm6
1424; SSE2-NEXT: por %xmm3, %xmm6
1425; SSE2-NEXT: movdqa %xmm2, %xmm0
1426; SSE2-NEXT: pxor %xmm10, %xmm0
1427; SSE2-NEXT: movdqa %xmm9, %xmm3
1428; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
1429; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
1430; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
1431; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1432; SSE2-NEXT: pand %xmm4, %xmm0
1433; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1434; SSE2-NEXT: por %xmm0, %xmm3
1435; SSE2-NEXT: pand %xmm3, %xmm2
1436; SSE2-NEXT: pandn %xmm8, %xmm3
1437; SSE2-NEXT: por %xmm2, %xmm3
1438; SSE2-NEXT: movdqa %xmm3, %xmm0
1439; SSE2-NEXT: pxor %xmm10, %xmm0
1440; SSE2-NEXT: movdqa %xmm0, %xmm2
1441; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
1442; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1443; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
1444; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
1445; SSE2-NEXT: pand %xmm4, %xmm7
1446; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
1447; SSE2-NEXT: por %xmm7, %xmm0
1448; SSE2-NEXT: pand %xmm3, %xmm0
1449; SSE2-NEXT: movdqa %xmm6, %xmm2
1450; SSE2-NEXT: pxor %xmm10, %xmm2
1451; SSE2-NEXT: movdqa %xmm2, %xmm3
1452; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
1453; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
1454; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
1455; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
1456; SSE2-NEXT: pand %xmm4, %xmm7
1457; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
1458; SSE2-NEXT: por %xmm7, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001459; SSE2-NEXT: pand %xmm6, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00001460; SSE2-NEXT: movdqa %xmm1, %xmm3
1461; SSE2-NEXT: pxor %xmm10, %xmm3
1462; SSE2-NEXT: movdqa %xmm3, %xmm4
1463; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
1464; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1465; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
1466; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1467; SSE2-NEXT: pand %xmm6, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001468; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001469; SSE2-NEXT: por %xmm3, %xmm4
1470; SSE2-NEXT: pand %xmm1, %xmm4
1471; SSE2-NEXT: movdqa %xmm5, %xmm1
1472; SSE2-NEXT: pxor %xmm10, %xmm1
1473; SSE2-NEXT: movdqa %xmm1, %xmm3
1474; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
1475; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
1476; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
1477; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1478; SSE2-NEXT: pand %xmm6, %xmm1
1479; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1480; SSE2-NEXT: por %xmm1, %xmm3
1481; SSE2-NEXT: pand %xmm5, %xmm3
1482; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001483; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001484; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
1485; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
1486; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1487; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1488; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
1489; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001490; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001491; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1492; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001493; SSE2-NEXT: retq
1494;
1495; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
1496; SSSE3: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00001497; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
1498; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
1499; SSSE3-NEXT: movdqa %xmm1, %xmm5
1500; SSSE3-NEXT: pxor %xmm10, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001501; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
1502; SSSE3-NEXT: movdqa %xmm9, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001503; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
1504; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1505; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +00001506; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
1507; SSSE3-NEXT: pand %xmm7, %xmm4
1508; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
1509; SSSE3-NEXT: por %xmm4, %xmm5
1510; SSSE3-NEXT: pand %xmm5, %xmm1
1511; SSSE3-NEXT: pandn %xmm8, %xmm5
1512; SSSE3-NEXT: por %xmm1, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001513; SSSE3-NEXT: movdqa %xmm0, %xmm1
Simon Pilgrim0be55672018-02-11 10:52:37 +00001514; SSSE3-NEXT: pxor %xmm10, %xmm1
1515; SSSE3-NEXT: movdqa %xmm9, %xmm4
1516; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001517; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001518; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
1519; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
1520; SSSE3-NEXT: pand %xmm6, %xmm7
1521; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
1522; SSSE3-NEXT: por %xmm7, %xmm1
1523; SSSE3-NEXT: pand %xmm1, %xmm0
1524; SSSE3-NEXT: pandn %xmm8, %xmm1
1525; SSSE3-NEXT: por %xmm0, %xmm1
1526; SSSE3-NEXT: movdqa %xmm3, %xmm0
1527; SSSE3-NEXT: pxor %xmm10, %xmm0
1528; SSSE3-NEXT: movdqa %xmm9, %xmm4
1529; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
1530; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1531; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
1532; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1533; SSSE3-NEXT: pand %xmm6, %xmm0
1534; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
1535; SSSE3-NEXT: por %xmm0, %xmm6
1536; SSSE3-NEXT: pand %xmm6, %xmm3
1537; SSSE3-NEXT: pandn %xmm8, %xmm6
1538; SSSE3-NEXT: por %xmm3, %xmm6
1539; SSSE3-NEXT: movdqa %xmm2, %xmm0
1540; SSSE3-NEXT: pxor %xmm10, %xmm0
1541; SSSE3-NEXT: movdqa %xmm9, %xmm3
1542; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
1543; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
1544; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
1545; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1546; SSSE3-NEXT: pand %xmm4, %xmm0
1547; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1548; SSSE3-NEXT: por %xmm0, %xmm3
1549; SSSE3-NEXT: pand %xmm3, %xmm2
1550; SSSE3-NEXT: pandn %xmm8, %xmm3
1551; SSSE3-NEXT: por %xmm2, %xmm3
1552; SSSE3-NEXT: movdqa %xmm3, %xmm0
1553; SSSE3-NEXT: pxor %xmm10, %xmm0
1554; SSSE3-NEXT: movdqa %xmm0, %xmm2
1555; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
1556; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1557; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
1558; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
1559; SSSE3-NEXT: pand %xmm4, %xmm7
1560; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
1561; SSSE3-NEXT: por %xmm7, %xmm0
1562; SSSE3-NEXT: pand %xmm3, %xmm0
1563; SSSE3-NEXT: movdqa %xmm6, %xmm2
1564; SSSE3-NEXT: pxor %xmm10, %xmm2
1565; SSSE3-NEXT: movdqa %xmm2, %xmm3
1566; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
1567; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
1568; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
1569; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
1570; SSSE3-NEXT: pand %xmm4, %xmm7
1571; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
1572; SSSE3-NEXT: por %xmm7, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001573; SSSE3-NEXT: pand %xmm6, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00001574; SSSE3-NEXT: movdqa %xmm1, %xmm3
1575; SSSE3-NEXT: pxor %xmm10, %xmm3
1576; SSSE3-NEXT: movdqa %xmm3, %xmm4
1577; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
1578; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1579; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
1580; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1581; SSSE3-NEXT: pand %xmm6, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001582; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001583; SSSE3-NEXT: por %xmm3, %xmm4
1584; SSSE3-NEXT: pand %xmm1, %xmm4
1585; SSSE3-NEXT: movdqa %xmm5, %xmm1
1586; SSSE3-NEXT: pxor %xmm10, %xmm1
1587; SSSE3-NEXT: movdqa %xmm1, %xmm3
1588; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
1589; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
1590; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
1591; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1592; SSSE3-NEXT: pand %xmm6, %xmm1
1593; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1594; SSSE3-NEXT: por %xmm1, %xmm3
1595; SSSE3-NEXT: pand %xmm5, %xmm3
1596; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001597; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001598; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
1599; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
1600; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1601; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1602; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
1603; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001604; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001605; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1606; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001607; SSSE3-NEXT: retq
1608;
1609; SSE41-LABEL: trunc_packus_v8i64_v8i8:
1610; SSE41: # %bb.0:
1611; SSE41-NEXT: movdqa %xmm0, %xmm8
Simon Pilgrim0be55672018-02-11 10:52:37 +00001612; SSE41-NEXT: movapd {{.*#+}} xmm7 = [255,255]
1613; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
1614; SSE41-NEXT: movdqa %xmm3, %xmm0
1615; SSE41-NEXT: pxor %xmm9, %xmm0
1616; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483903,2147483903]
1617; SSE41-NEXT: movdqa %xmm10, %xmm6
1618; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
1619; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
1620; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
1621; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1622; SSE41-NEXT: pand %xmm5, %xmm4
1623; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
1624; SSE41-NEXT: por %xmm4, %xmm0
1625; SSE41-NEXT: movapd %xmm7, %xmm6
1626; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
1627; SSE41-NEXT: movdqa %xmm2, %xmm0
1628; SSE41-NEXT: pxor %xmm9, %xmm0
1629; SSE41-NEXT: movdqa %xmm10, %xmm3
1630; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
1631; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
1632; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
1633; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001634; SSE41-NEXT: pand %xmm4, %xmm5
Simon Pilgrim0be55672018-02-11 10:52:37 +00001635; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
1636; SSE41-NEXT: por %xmm5, %xmm0
1637; SSE41-NEXT: movapd %xmm7, %xmm3
1638; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
1639; SSE41-NEXT: movdqa %xmm1, %xmm0
1640; SSE41-NEXT: pxor %xmm9, %xmm0
1641; SSE41-NEXT: movdqa %xmm10, %xmm2
1642; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
1643; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1644; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
1645; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
1646; SSE41-NEXT: pand %xmm4, %xmm5
1647; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
1648; SSE41-NEXT: por %xmm5, %xmm0
1649; SSE41-NEXT: movapd %xmm7, %xmm2
1650; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
1651; SSE41-NEXT: movdqa %xmm8, %xmm0
1652; SSE41-NEXT: pxor %xmm9, %xmm0
1653; SSE41-NEXT: movdqa %xmm10, %xmm1
1654; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
1655; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
1656; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
1657; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
1658; SSE41-NEXT: pand %xmm4, %xmm5
1659; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
1660; SSE41-NEXT: por %xmm5, %xmm0
1661; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
1662; SSE41-NEXT: movapd %xmm7, %xmm0
1663; SSE41-NEXT: xorpd %xmm9, %xmm0
1664; SSE41-NEXT: movapd %xmm0, %xmm1
1665; SSE41-NEXT: pcmpgtd %xmm9, %xmm1
1666; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
1667; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1668; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
1669; SSE41-NEXT: pand %xmm4, %xmm5
1670; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
1671; SSE41-NEXT: por %xmm5, %xmm0
1672; SSE41-NEXT: pxor %xmm8, %xmm8
1673; SSE41-NEXT: pxor %xmm1, %xmm1
1674; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
1675; SSE41-NEXT: movapd %xmm2, %xmm0
1676; SSE41-NEXT: xorpd %xmm9, %xmm0
1677; SSE41-NEXT: movapd %xmm0, %xmm5
1678; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
1679; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
1680; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1681; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1682; SSE41-NEXT: pand %xmm7, %xmm4
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001683; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
1684; SSE41-NEXT: por %xmm4, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001685; SSE41-NEXT: pxor %xmm7, %xmm7
1686; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm7
1687; SSE41-NEXT: movapd %xmm3, %xmm0
1688; SSE41-NEXT: xorpd %xmm9, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001689; SSE41-NEXT: movapd %xmm0, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00001690; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
1691; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1692; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1693; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
1694; SSE41-NEXT: pand %xmm4, %xmm5
1695; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
1696; SSE41-NEXT: por %xmm5, %xmm0
1697; SSE41-NEXT: pxor %xmm2, %xmm2
1698; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
1699; SSE41-NEXT: movapd %xmm6, %xmm0
1700; SSE41-NEXT: xorpd %xmm9, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001701; SSE41-NEXT: movapd %xmm0, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00001702; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001703; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
Simon Pilgrim0be55672018-02-11 10:52:37 +00001704; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1705; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
1706; SSE41-NEXT: pand %xmm4, %xmm5
1707; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
1708; SSE41-NEXT: por %xmm5, %xmm0
1709; SSE41-NEXT: pxor %xmm3, %xmm3
1710; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
1711; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2,3],xmm3[4],xmm8[5,6,7]
1712; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2,3],xmm2[4],xmm8[5,6,7]
1713; SSE41-NEXT: packusdw %xmm3, %xmm2
1714; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1,2,3],xmm7[4],xmm8[5,6,7]
1715; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1,2,3],xmm1[4],xmm8[5,6,7]
1716; SSE41-NEXT: packusdw %xmm7, %xmm1
1717; SSE41-NEXT: packusdw %xmm2, %xmm1
1718; SSE41-NEXT: movdqa %xmm1, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001719; SSE41-NEXT: retq
1720;
1721; AVX1-LABEL: trunc_packus_v8i64_v8i8:
1722; AVX1: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00001723; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255]
1724; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1725; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
1726; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
1727; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
1728; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
1729; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
1730; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1731; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
1732; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4
1733; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
1734; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001735; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
1736; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm8
1737; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
1738; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
1739; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6
1740; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
1741; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm3
1742; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1743; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
1744; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1
1745; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
1746; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1747; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
1748; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
1749; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0
1750; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1751; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1752; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1753; AVX1-NEXT: vzeroupper
1754; AVX1-NEXT: retq
1755;
1756; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i8:
1757; AVX2-SLOW: # %bb.0:
1758; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
1759; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001760; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001761; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
1762; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001763; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00001764; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
1765; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1
1766; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
1767; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001768; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1769; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1770; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1771; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1772; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1773; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1774; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
Puyan Lotfi43e94b12018-01-31 22:04:26 +00001775; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001776; AVX2-SLOW-NEXT: vzeroupper
1777; AVX2-SLOW-NEXT: retq
1778;
1779; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i8:
1780; AVX2-FAST: # %bb.0:
1781; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
1782; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001783; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00001784; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
1785; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001786; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00001787; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
1788; AVX2-FAST-NEXT: vpand %ymm1, %ymm3, %ymm1
1789; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
1790; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001791; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1792; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1793; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1794; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1795; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1796; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
Puyan Lotfi43e94b12018-01-31 22:04:26 +00001797; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00001798; AVX2-FAST-NEXT: vzeroupper
1799; AVX2-FAST-NEXT: retq
1800;
1801; AVX512-LABEL: trunc_packus_v8i64_v8i8:
1802; AVX512: # %bb.0:
1803; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
1804; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1805; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
1806; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1807; AVX512-NEXT: vzeroupper
1808; AVX512-NEXT: retq
; IR under test starts here.
; Signed min against 255 in every lane, so lanes greater than 255 become 255.
1809 %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
1810 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
; Signed max against 0 in every lane, so negative lanes become 0.
1811 %3 = icmp sgt <8 x i64> %2, zeroinitializer
1812 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
; Each lane is now within [0, 255], so the narrowing to i8 discards only zero bits.
1813 %5 = trunc <8 x i64> %4 to <8 x i8>
1814 ret <8 x i8> %5
1815}
1816
Simon Pilgrim689d8132018-02-15 17:48:34 +00001817define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
1818; SSE2-LABEL: trunc_packus_v8i64_v8i8_store:
1819; SSE2: # %bb.0:
1820; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
1821; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
1822; SSE2-NEXT: movdqa %xmm3, %xmm4
1823; SSE2-NEXT: pxor %xmm10, %xmm4
1824; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
1825; SSE2-NEXT: movdqa %xmm9, %xmm6
1826; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
1827; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1828; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
1829; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1830; SSE2-NEXT: pand %xmm7, %xmm4
1831; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
1832; SSE2-NEXT: por %xmm4, %xmm11
1833; SSE2-NEXT: pand %xmm11, %xmm3
1834; SSE2-NEXT: pandn %xmm8, %xmm11
1835; SSE2-NEXT: por %xmm3, %xmm11
1836; SSE2-NEXT: movdqa %xmm2, %xmm3
1837; SSE2-NEXT: pxor %xmm10, %xmm3
1838; SSE2-NEXT: movdqa %xmm9, %xmm4
1839; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
1840; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
1841; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
1842; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1843; SSE2-NEXT: pand %xmm7, %xmm5
1844; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
1845; SSE2-NEXT: por %xmm5, %xmm3
1846; SSE2-NEXT: pand %xmm3, %xmm2
1847; SSE2-NEXT: pandn %xmm8, %xmm3
1848; SSE2-NEXT: por %xmm2, %xmm3
1849; SSE2-NEXT: movdqa %xmm1, %xmm2
1850; SSE2-NEXT: pxor %xmm10, %xmm2
1851; SSE2-NEXT: movdqa %xmm9, %xmm4
1852; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
1853; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1854; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
1855; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
1856; SSE2-NEXT: pand %xmm5, %xmm7
1857; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
1858; SSE2-NEXT: por %xmm7, %xmm2
1859; SSE2-NEXT: pand %xmm2, %xmm1
1860; SSE2-NEXT: pandn %xmm8, %xmm2
1861; SSE2-NEXT: por %xmm1, %xmm2
1862; SSE2-NEXT: movdqa %xmm0, %xmm1
1863; SSE2-NEXT: pxor %xmm10, %xmm1
1864; SSE2-NEXT: movdqa %xmm9, %xmm4
1865; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
1866; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1867; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
1868; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
1869; SSE2-NEXT: pand %xmm5, %xmm7
1870; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
1871; SSE2-NEXT: por %xmm7, %xmm1
1872; SSE2-NEXT: pand %xmm1, %xmm0
1873; SSE2-NEXT: pandn %xmm8, %xmm1
1874; SSE2-NEXT: por %xmm0, %xmm1
1875; SSE2-NEXT: movdqa %xmm1, %xmm0
1876; SSE2-NEXT: pxor %xmm10, %xmm0
1877; SSE2-NEXT: movdqa %xmm0, %xmm4
1878; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
1879; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1880; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
1881; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
1882; SSE2-NEXT: pand %xmm5, %xmm7
1883; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
1884; SSE2-NEXT: por %xmm7, %xmm0
1885; SSE2-NEXT: movdqa %xmm2, %xmm4
1886; SSE2-NEXT: pxor %xmm10, %xmm4
1887; SSE2-NEXT: movdqa %xmm4, %xmm5
1888; SSE2-NEXT: pcmpgtd %xmm10, %xmm5
1889; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
1890; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
1891; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1892; SSE2-NEXT: pand %xmm7, %xmm4
1893; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
1894; SSE2-NEXT: por %xmm4, %xmm7
1895; SSE2-NEXT: movdqa %xmm3, %xmm4
1896; SSE2-NEXT: pxor %xmm10, %xmm4
1897; SSE2-NEXT: movdqa %xmm4, %xmm5
1898; SSE2-NEXT: pcmpgtd %xmm10, %xmm5
1899; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
1900; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
1901; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
1902; SSE2-NEXT: pand %xmm9, %xmm6
1903; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
1904; SSE2-NEXT: por %xmm6, %xmm4
1905; SSE2-NEXT: movdqa %xmm11, %xmm5
1906; SSE2-NEXT: pxor %xmm10, %xmm5
1907; SSE2-NEXT: movdqa %xmm5, %xmm6
1908; SSE2-NEXT: pcmpgtd %xmm10, %xmm6
1909; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
1910; SSE2-NEXT: pcmpeqd %xmm10, %xmm5
1911; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1912; SSE2-NEXT: pand %xmm9, %xmm5
1913; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1914; SSE2-NEXT: por %xmm5, %xmm6
1915; SSE2-NEXT: pand %xmm8, %xmm6
1916; SSE2-NEXT: pand %xmm11, %xmm6
1917; SSE2-NEXT: pand %xmm8, %xmm4
1918; SSE2-NEXT: pand %xmm3, %xmm4
1919; SSE2-NEXT: packuswb %xmm6, %xmm4
1920; SSE2-NEXT: pand %xmm8, %xmm7
1921; SSE2-NEXT: pand %xmm2, %xmm7
1922; SSE2-NEXT: pand %xmm8, %xmm0
1923; SSE2-NEXT: pand %xmm1, %xmm0
1924; SSE2-NEXT: packuswb %xmm7, %xmm0
1925; SSE2-NEXT: packuswb %xmm4, %xmm0
1926; SSE2-NEXT: packuswb %xmm0, %xmm0
1927; SSE2-NEXT: movq %xmm0, (%rdi)
1928; SSE2-NEXT: retq
1929;
1930; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store:
1931; SSSE3: # %bb.0:
1932; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
1933; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
1934; SSSE3-NEXT: movdqa %xmm3, %xmm4
1935; SSSE3-NEXT: pxor %xmm10, %xmm4
1936; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
1937; SSSE3-NEXT: movdqa %xmm9, %xmm6
1938; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1939; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1940; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
1941; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1942; SSSE3-NEXT: pand %xmm7, %xmm4
1943; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
1944; SSSE3-NEXT: por %xmm4, %xmm11
1945; SSSE3-NEXT: pand %xmm11, %xmm3
1946; SSSE3-NEXT: pandn %xmm8, %xmm11
1947; SSSE3-NEXT: por %xmm3, %xmm11
1948; SSSE3-NEXT: movdqa %xmm2, %xmm3
1949; SSSE3-NEXT: pxor %xmm10, %xmm3
1950; SSSE3-NEXT: movdqa %xmm9, %xmm4
1951; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
1952; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
1953; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
1954; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1955; SSSE3-NEXT: pand %xmm7, %xmm5
1956; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
1957; SSSE3-NEXT: por %xmm5, %xmm3
1958; SSSE3-NEXT: pand %xmm3, %xmm2
1959; SSSE3-NEXT: pandn %xmm8, %xmm3
1960; SSSE3-NEXT: por %xmm2, %xmm3
1961; SSSE3-NEXT: movdqa %xmm1, %xmm2
1962; SSSE3-NEXT: pxor %xmm10, %xmm2
1963; SSSE3-NEXT: movdqa %xmm9, %xmm4
1964; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
1965; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1966; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
1967; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
1968; SSSE3-NEXT: pand %xmm5, %xmm7
1969; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
1970; SSSE3-NEXT: por %xmm7, %xmm2
1971; SSSE3-NEXT: pand %xmm2, %xmm1
1972; SSSE3-NEXT: pandn %xmm8, %xmm2
1973; SSSE3-NEXT: por %xmm1, %xmm2
1974; SSSE3-NEXT: movdqa %xmm0, %xmm1
1975; SSSE3-NEXT: pxor %xmm10, %xmm1
1976; SSSE3-NEXT: movdqa %xmm9, %xmm4
1977; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
1978; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1979; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
1980; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
1981; SSSE3-NEXT: pand %xmm5, %xmm7
1982; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
1983; SSSE3-NEXT: por %xmm7, %xmm1
1984; SSSE3-NEXT: pand %xmm1, %xmm0
1985; SSSE3-NEXT: pandn %xmm8, %xmm1
1986; SSSE3-NEXT: por %xmm0, %xmm1
1987; SSSE3-NEXT: movdqa %xmm1, %xmm0
1988; SSSE3-NEXT: pxor %xmm10, %xmm0
1989; SSSE3-NEXT: movdqa %xmm0, %xmm4
1990; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
1991; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1992; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
1993; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
1994; SSSE3-NEXT: pand %xmm5, %xmm7
1995; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
1996; SSSE3-NEXT: por %xmm7, %xmm0
1997; SSSE3-NEXT: movdqa %xmm2, %xmm4
1998; SSSE3-NEXT: pxor %xmm10, %xmm4
1999; SSSE3-NEXT: movdqa %xmm4, %xmm5
2000; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5
2001; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
2002; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
2003; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2004; SSSE3-NEXT: pand %xmm7, %xmm4
2005; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
2006; SSSE3-NEXT: por %xmm4, %xmm7
2007; SSSE3-NEXT: movdqa %xmm3, %xmm4
2008; SSSE3-NEXT: pxor %xmm10, %xmm4
2009; SSSE3-NEXT: movdqa %xmm4, %xmm5
2010; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5
2011; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
2012; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
2013; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
2014; SSSE3-NEXT: pand %xmm9, %xmm6
2015; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
2016; SSSE3-NEXT: por %xmm6, %xmm4
2017; SSSE3-NEXT: movdqa %xmm11, %xmm5
2018; SSSE3-NEXT: pxor %xmm10, %xmm5
2019; SSSE3-NEXT: movdqa %xmm5, %xmm6
2020; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6
2021; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
2022; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5
2023; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2024; SSSE3-NEXT: pand %xmm9, %xmm5
2025; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2026; SSSE3-NEXT: por %xmm5, %xmm6
2027; SSSE3-NEXT: pand %xmm8, %xmm6
2028; SSSE3-NEXT: pand %xmm11, %xmm6
2029; SSSE3-NEXT: pand %xmm8, %xmm4
2030; SSSE3-NEXT: pand %xmm3, %xmm4
2031; SSSE3-NEXT: packuswb %xmm6, %xmm4
2032; SSSE3-NEXT: pand %xmm8, %xmm7
2033; SSSE3-NEXT: pand %xmm2, %xmm7
2034; SSSE3-NEXT: pand %xmm8, %xmm0
2035; SSSE3-NEXT: pand %xmm1, %xmm0
2036; SSSE3-NEXT: packuswb %xmm7, %xmm0
2037; SSSE3-NEXT: packuswb %xmm4, %xmm0
2038; SSSE3-NEXT: packuswb %xmm0, %xmm0
2039; SSSE3-NEXT: movq %xmm0, (%rdi)
2040; SSSE3-NEXT: retq
2041;
2042; SSE41-LABEL: trunc_packus_v8i64_v8i8_store:
2043; SSE41: # %bb.0:
2044; SSE41-NEXT: movdqa %xmm0, %xmm9
2045; SSE41-NEXT: movapd {{.*#+}} xmm8 = [255,255]
2046; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
2047; SSE41-NEXT: movdqa %xmm3, %xmm0
2048; SSE41-NEXT: pxor %xmm10, %xmm0
2049; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903]
2050; SSE41-NEXT: movdqa %xmm5, %xmm4
2051; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
2052; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
2053; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
2054; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
2055; SSE41-NEXT: pand %xmm7, %xmm6
2056; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
2057; SSE41-NEXT: por %xmm6, %xmm0
2058; SSE41-NEXT: movapd %xmm8, %xmm11
2059; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11
2060; SSE41-NEXT: movdqa %xmm2, %xmm0
2061; SSE41-NEXT: pxor %xmm10, %xmm0
2062; SSE41-NEXT: movdqa %xmm5, %xmm3
2063; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
2064; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
2065; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
2066; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
2067; SSE41-NEXT: pand %xmm4, %xmm6
2068; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
2069; SSE41-NEXT: por %xmm6, %xmm0
2070; SSE41-NEXT: movapd %xmm8, %xmm3
2071; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
2072; SSE41-NEXT: movdqa %xmm1, %xmm0
2073; SSE41-NEXT: pxor %xmm10, %xmm0
2074; SSE41-NEXT: movdqa %xmm5, %xmm2
2075; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
2076; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
2077; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
2078; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
2079; SSE41-NEXT: pand %xmm4, %xmm6
2080; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
2081; SSE41-NEXT: por %xmm6, %xmm0
2082; SSE41-NEXT: movapd %xmm8, %xmm4
2083; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
2084; SSE41-NEXT: movdqa %xmm9, %xmm0
2085; SSE41-NEXT: pxor %xmm10, %xmm0
2086; SSE41-NEXT: movdqa %xmm5, %xmm1
2087; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
2088; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
2089; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
2090; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2091; SSE41-NEXT: pand %xmm2, %xmm5
2092; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
2093; SSE41-NEXT: por %xmm5, %xmm0
2094; SSE41-NEXT: movapd %xmm8, %xmm5
2095; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5
2096; SSE41-NEXT: movapd %xmm5, %xmm0
2097; SSE41-NEXT: xorpd %xmm10, %xmm0
2098; SSE41-NEXT: movapd %xmm0, %xmm1
2099; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
2100; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
2101; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2102; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
2103; SSE41-NEXT: pand %xmm2, %xmm6
2104; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
2105; SSE41-NEXT: por %xmm6, %xmm0
2106; SSE41-NEXT: pxor %xmm2, %xmm2
2107; SSE41-NEXT: pxor %xmm1, %xmm1
2108; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
2109; SSE41-NEXT: movapd %xmm4, %xmm0
2110; SSE41-NEXT: xorpd %xmm10, %xmm0
2111; SSE41-NEXT: movapd %xmm0, %xmm5
2112; SSE41-NEXT: pcmpgtd %xmm10, %xmm5
2113; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
2114; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2115; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
2116; SSE41-NEXT: pand %xmm6, %xmm7
2117; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
2118; SSE41-NEXT: por %xmm7, %xmm0
2119; SSE41-NEXT: pxor %xmm5, %xmm5
2120; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
2121; SSE41-NEXT: movapd %xmm3, %xmm0
2122; SSE41-NEXT: xorpd %xmm10, %xmm0
2123; SSE41-NEXT: movapd %xmm0, %xmm4
2124; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
2125; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
2126; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2127; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
2128; SSE41-NEXT: pand %xmm6, %xmm7
2129; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
2130; SSE41-NEXT: por %xmm7, %xmm0
2131; SSE41-NEXT: pxor %xmm4, %xmm4
2132; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
2133; SSE41-NEXT: movapd %xmm11, %xmm0
2134; SSE41-NEXT: xorpd %xmm10, %xmm0
2135; SSE41-NEXT: movapd %xmm0, %xmm3
2136; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
2137; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
2138; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2139; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
2140; SSE41-NEXT: pand %xmm6, %xmm7
2141; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
2142; SSE41-NEXT: por %xmm7, %xmm0
2143; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2
2144; SSE41-NEXT: andpd %xmm8, %xmm2
2145; SSE41-NEXT: andpd %xmm8, %xmm4
2146; SSE41-NEXT: packuswb %xmm2, %xmm4
2147; SSE41-NEXT: andpd %xmm8, %xmm5
2148; SSE41-NEXT: andpd %xmm8, %xmm1
2149; SSE41-NEXT: packuswb %xmm5, %xmm1
2150; SSE41-NEXT: packuswb %xmm4, %xmm1
2151; SSE41-NEXT: packuswb %xmm1, %xmm1
2152; SSE41-NEXT: movq %xmm1, (%rdi)
2153; SSE41-NEXT: retq
2154;
2155; AVX1-LABEL: trunc_packus_v8i64_v8i8_store:
2156; AVX1: # %bb.0:
2157; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255]
2158; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2159; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
2160; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
2161; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
2162; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
2163; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
2164; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
2165; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
2166; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5
2167; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
2168; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
2169; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
2170; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm8
2171; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
2172; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm6
2173; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm7
2174; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2175; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
2176; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2177; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2178; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2179; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm1
2180; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2181; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm2
2182; AVX1-NEXT: vpand %xmm2, %xmm6, %xmm2
2183; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2184; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0
2185; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2186; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2187; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
2188; AVX1-NEXT: vmovq %xmm0, (%rdi)
2189; AVX1-NEXT: vzeroupper
2190; AVX1-NEXT: retq
2191;
2192; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i8_store:
2193; AVX2-SLOW: # %bb.0:
2194; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
2195; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
2196; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
2197; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
2198; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
2199; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
2200; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
2201; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
2202; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
2203; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
2204; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
2205; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2206; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2207; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2208; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2209; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2210; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2211; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2212; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
2213; AVX2-SLOW-NEXT: vmovq %xmm0, (%rdi)
2214; AVX2-SLOW-NEXT: vzeroupper
2215; AVX2-SLOW-NEXT: retq
2216;
2217; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i8_store:
2218; AVX2-FAST: # %bb.0:
2219; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
2220; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
2221; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
2222; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
2223; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
2224; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
2225; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
2226; AVX2-FAST-NEXT: vpand %ymm0, %ymm3, %ymm0
2227; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
2228; AVX2-FAST-NEXT: vpand %ymm1, %ymm2, %ymm1
2229; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
2230; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
2231; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2232; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2233; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
2234; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2235; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2236; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
2237; AVX2-FAST-NEXT: vmovq %xmm0, (%rdi)
2238; AVX2-FAST-NEXT: vzeroupper
2239; AVX2-FAST-NEXT: retq
2240;
2241; AVX512-LABEL: trunc_packus_v8i64_v8i8_store:
2242; AVX512: # %bb.0:
2243; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
2244; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
2245; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
2246; AVX512-NEXT: vpmovqb %zmm0, (%rdi)
2247; AVX512-NEXT: vzeroupper
2248; AVX512-NEXT: retq
2249 %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
2250 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
2251 %3 = icmp sgt <8 x i64> %2, zeroinitializer
2252 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
2253 %5 = trunc <8 x i64> %4 to <8 x i8>
2254 store <8 x i8> %5, <8 x i8> *%p1
2255 ret void
2256}
2257
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002258define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
2259; SSE2-LABEL: trunc_packus_v16i64_v16i8:
2260; SSE2: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00002261; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [255,255]
2262; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
2263; SSE2-NEXT: movdqa %xmm7, %xmm8
2264; SSE2-NEXT: pxor %xmm9, %xmm8
2265; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903]
2266; SSE2-NEXT: movdqa %xmm11, %xmm10
2267; SSE2-NEXT: pcmpgtd %xmm8, %xmm10
2268; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2]
2269; SSE2-NEXT: pcmpeqd %xmm11, %xmm8
2270; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
2271; SSE2-NEXT: pand %xmm12, %xmm8
2272; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
2273; SSE2-NEXT: por %xmm8, %xmm10
2274; SSE2-NEXT: pand %xmm10, %xmm7
2275; SSE2-NEXT: pandn %xmm13, %xmm10
2276; SSE2-NEXT: por %xmm7, %xmm10
2277; SSE2-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
2278; SSE2-NEXT: movdqa %xmm6, %xmm7
2279; SSE2-NEXT: pxor %xmm9, %xmm7
2280; SSE2-NEXT: movdqa %xmm11, %xmm8
2281; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
2282; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
2283; SSE2-NEXT: pcmpeqd %xmm11, %xmm7
2284; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
2285; SSE2-NEXT: pand %xmm12, %xmm10
2286; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm8[1,1,3,3]
2287; SSE2-NEXT: por %xmm10, %xmm14
2288; SSE2-NEXT: pand %xmm14, %xmm6
2289; SSE2-NEXT: pandn %xmm13, %xmm14
2290; SSE2-NEXT: por %xmm6, %xmm14
2291; SSE2-NEXT: movdqa %xmm5, %xmm6
2292; SSE2-NEXT: pxor %xmm9, %xmm6
2293; SSE2-NEXT: movdqa %xmm11, %xmm7
2294; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
2295; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
2296; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
2297; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2298; SSE2-NEXT: pand %xmm8, %xmm6
2299; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
2300; SSE2-NEXT: por %xmm6, %xmm7
2301; SSE2-NEXT: pand %xmm7, %xmm5
2302; SSE2-NEXT: pandn %xmm13, %xmm7
2303; SSE2-NEXT: por %xmm5, %xmm7
2304; SSE2-NEXT: movdqa %xmm7, %xmm10
Quentin Colombet48abac82018-02-17 03:05:33 +00002305; SSE2-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
Simon Pilgrim0be55672018-02-11 10:52:37 +00002306; SSE2-NEXT: movdqa %xmm4, %xmm5
2307; SSE2-NEXT: pxor %xmm9, %xmm5
2308; SSE2-NEXT: movdqa %xmm11, %xmm6
2309; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
2310; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
2311; SSE2-NEXT: pcmpeqd %xmm11, %xmm5
2312; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2313; SSE2-NEXT: pand %xmm7, %xmm5
2314; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2315; SSE2-NEXT: por %xmm5, %xmm6
2316; SSE2-NEXT: pand %xmm6, %xmm4
2317; SSE2-NEXT: pandn %xmm13, %xmm6
2318; SSE2-NEXT: por %xmm4, %xmm6
2319; SSE2-NEXT: movdqa %xmm6, %xmm7
Quentin Colombet48abac82018-02-17 03:05:33 +00002320; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
Simon Pilgrim0be55672018-02-11 10:52:37 +00002321; SSE2-NEXT: movdqa %xmm3, %xmm4
2322; SSE2-NEXT: pxor %xmm9, %xmm4
2323; SSE2-NEXT: movdqa %xmm11, %xmm5
2324; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
2325; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
2326; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002327; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +00002328; SSE2-NEXT: pand %xmm6, %xmm4
2329; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2330; SSE2-NEXT: por %xmm4, %xmm5
2331; SSE2-NEXT: pand %xmm5, %xmm3
2332; SSE2-NEXT: pandn %xmm13, %xmm5
2333; SSE2-NEXT: por %xmm3, %xmm5
2334; SSE2-NEXT: movdqa %xmm5, %xmm8
Quentin Colombet48abac82018-02-17 03:05:33 +00002335; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
Simon Pilgrim0be55672018-02-11 10:52:37 +00002336; SSE2-NEXT: movdqa %xmm2, %xmm3
2337; SSE2-NEXT: pxor %xmm9, %xmm3
2338; SSE2-NEXT: movdqa %xmm11, %xmm4
2339; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
2340; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
2341; SSE2-NEXT: pcmpeqd %xmm11, %xmm3
2342; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2343; SSE2-NEXT: pand %xmm5, %xmm3
2344; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
2345; SSE2-NEXT: por %xmm3, %xmm15
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002346; SSE2-NEXT: pand %xmm15, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00002347; SSE2-NEXT: pandn %xmm13, %xmm15
2348; SSE2-NEXT: movdqa %xmm13, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002349; SSE2-NEXT: por %xmm2, %xmm15
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002350; SSE2-NEXT: movdqa %xmm1, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00002351; SSE2-NEXT: pxor %xmm9, %xmm2
2352; SSE2-NEXT: movdqa %xmm11, %xmm3
2353; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
2354; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
2355; SSE2-NEXT: pcmpeqd %xmm11, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002356; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002357; SSE2-NEXT: pand %xmm4, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00002358; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
2359; SSE2-NEXT: por %xmm2, %xmm13
2360; SSE2-NEXT: pand %xmm13, %xmm1
2361; SSE2-NEXT: pandn %xmm5, %xmm13
2362; SSE2-NEXT: por %xmm1, %xmm13
2363; SSE2-NEXT: movdqa %xmm0, %xmm1
2364; SSE2-NEXT: pxor %xmm9, %xmm1
2365; SSE2-NEXT: movdqa %xmm11, %xmm2
2366; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
2367; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
2368; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
2369; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2370; SSE2-NEXT: pand %xmm4, %xmm1
2371; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
2372; SSE2-NEXT: por %xmm1, %xmm11
2373; SSE2-NEXT: pand %xmm11, %xmm0
2374; SSE2-NEXT: pandn %xmm5, %xmm11
2375; SSE2-NEXT: por %xmm0, %xmm11
2376; SSE2-NEXT: movdqa %xmm11, %xmm0
2377; SSE2-NEXT: pxor %xmm9, %xmm0
2378; SSE2-NEXT: movdqa %xmm0, %xmm1
2379; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
2380; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
2381; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
2382; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2383; SSE2-NEXT: pand %xmm4, %xmm0
2384; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
2385; SSE2-NEXT: por %xmm0, %xmm2
2386; SSE2-NEXT: movdqa %xmm13, %xmm0
2387; SSE2-NEXT: pxor %xmm9, %xmm0
2388; SSE2-NEXT: movdqa %xmm0, %xmm4
2389; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
2390; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
2391; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
2392; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2393; SSE2-NEXT: pand %xmm5, %xmm0
2394; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
2395; SSE2-NEXT: por %xmm0, %xmm12
2396; SSE2-NEXT: movdqa %xmm15, %xmm0
2397; SSE2-NEXT: pxor %xmm9, %xmm0
2398; SSE2-NEXT: movdqa %xmm0, %xmm4
2399; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
2400; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
2401; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
2402; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2403; SSE2-NEXT: pand %xmm5, %xmm0
2404; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
2405; SSE2-NEXT: por %xmm0, %xmm6
2406; SSE2-NEXT: movdqa %xmm8, %xmm0
2407; SSE2-NEXT: pxor %xmm9, %xmm0
2408; SSE2-NEXT: movdqa %xmm0, %xmm4
2409; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
2410; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
2411; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
2412; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2413; SSE2-NEXT: pand %xmm5, %xmm0
2414; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
2415; SSE2-NEXT: por %xmm0, %xmm8
2416; SSE2-NEXT: movdqa %xmm7, %xmm0
2417; SSE2-NEXT: pxor %xmm9, %xmm0
2418; SSE2-NEXT: movdqa %xmm0, %xmm5
2419; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
2420; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
2421; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2422; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
2423; SSE2-NEXT: pand %xmm7, %xmm0
2424; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2425; SSE2-NEXT: por %xmm0, %xmm5
2426; SSE2-NEXT: movdqa %xmm10, %xmm0
2427; SSE2-NEXT: pxor %xmm9, %xmm0
2428; SSE2-NEXT: movdqa %xmm0, %xmm7
2429; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
2430; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
2431; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2432; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
2433; SSE2-NEXT: pand %xmm10, %xmm0
2434; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
2435; SSE2-NEXT: por %xmm0, %xmm10
2436; SSE2-NEXT: movdqa %xmm14, %xmm0
2437; SSE2-NEXT: pxor %xmm9, %xmm0
2438; SSE2-NEXT: movdqa %xmm0, %xmm7
2439; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
2440; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
2441; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
2442; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
2443; SSE2-NEXT: pand %xmm0, %xmm1
2444; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
2445; SSE2-NEXT: por %xmm1, %xmm0
2446; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
2447; SSE2-NEXT: movdqa %xmm4, %xmm1
2448; SSE2-NEXT: pxor %xmm9, %xmm1
2449; SSE2-NEXT: movdqa %xmm1, %xmm7
2450; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
2451; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
2452; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2453; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002454; SSE2-NEXT: pand %xmm3, %xmm1
Simon Pilgrim0be55672018-02-11 10:52:37 +00002455; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
2456; SSE2-NEXT: por %xmm1, %xmm3
2457; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255]
2458; SSE2-NEXT: pand %xmm1, %xmm3
2459; SSE2-NEXT: pand %xmm4, %xmm3
2460; SSE2-NEXT: pand %xmm1, %xmm0
2461; SSE2-NEXT: pand %xmm14, %xmm0
2462; SSE2-NEXT: packuswb %xmm3, %xmm0
2463; SSE2-NEXT: pand %xmm1, %xmm10
2464; SSE2-NEXT: pand -{{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
2465; SSE2-NEXT: pand %xmm1, %xmm5
2466; SSE2-NEXT: pand -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
2467; SSE2-NEXT: packuswb %xmm10, %xmm5
2468; SSE2-NEXT: packuswb %xmm0, %xmm5
2469; SSE2-NEXT: pand %xmm1, %xmm8
2470; SSE2-NEXT: pand -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
2471; SSE2-NEXT: pand %xmm1, %xmm6
2472; SSE2-NEXT: pand %xmm15, %xmm6
2473; SSE2-NEXT: packuswb %xmm8, %xmm6
2474; SSE2-NEXT: pand %xmm1, %xmm12
2475; SSE2-NEXT: pand %xmm13, %xmm12
2476; SSE2-NEXT: pand %xmm1, %xmm2
2477; SSE2-NEXT: pand %xmm11, %xmm2
2478; SSE2-NEXT: packuswb %xmm12, %xmm2
2479; SSE2-NEXT: packuswb %xmm6, %xmm2
2480; SSE2-NEXT: packuswb %xmm5, %xmm2
2481; SSE2-NEXT: movdqa %xmm2, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002482; SSE2-NEXT: retq
2483;
2484; SSSE3-LABEL: trunc_packus_v16i64_v16i8:
2485; SSSE3: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00002486; SSSE3-NEXT: movdqa {{.*#+}} xmm13 = [255,255]
2487; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
2488; SSSE3-NEXT: movdqa %xmm7, %xmm8
2489; SSSE3-NEXT: pxor %xmm9, %xmm8
2490; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903]
2491; SSSE3-NEXT: movdqa %xmm11, %xmm10
2492; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10
2493; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2]
2494; SSSE3-NEXT: pcmpeqd %xmm11, %xmm8
2495; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
2496; SSSE3-NEXT: pand %xmm12, %xmm8
2497; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
2498; SSSE3-NEXT: por %xmm8, %xmm10
2499; SSSE3-NEXT: pand %xmm10, %xmm7
2500; SSSE3-NEXT: pandn %xmm13, %xmm10
2501; SSSE3-NEXT: por %xmm7, %xmm10
2502; SSSE3-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
2503; SSSE3-NEXT: movdqa %xmm6, %xmm7
2504; SSSE3-NEXT: pxor %xmm9, %xmm7
2505; SSSE3-NEXT: movdqa %xmm11, %xmm8
2506; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
2507; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
2508; SSSE3-NEXT: pcmpeqd %xmm11, %xmm7
2509; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
2510; SSSE3-NEXT: pand %xmm12, %xmm10
2511; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm8[1,1,3,3]
2512; SSSE3-NEXT: por %xmm10, %xmm14
2513; SSSE3-NEXT: pand %xmm14, %xmm6
2514; SSSE3-NEXT: pandn %xmm13, %xmm14
2515; SSSE3-NEXT: por %xmm6, %xmm14
2516; SSSE3-NEXT: movdqa %xmm5, %xmm6
2517; SSSE3-NEXT: pxor %xmm9, %xmm6
2518; SSSE3-NEXT: movdqa %xmm11, %xmm7
2519; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
2520; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
2521; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6
2522; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2523; SSSE3-NEXT: pand %xmm8, %xmm6
2524; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
2525; SSSE3-NEXT: por %xmm6, %xmm7
2526; SSSE3-NEXT: pand %xmm7, %xmm5
2527; SSSE3-NEXT: pandn %xmm13, %xmm7
2528; SSSE3-NEXT: por %xmm5, %xmm7
2529; SSSE3-NEXT: movdqa %xmm7, %xmm10
Quentin Colombet48abac82018-02-17 03:05:33 +00002530; SSSE3-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
Simon Pilgrim0be55672018-02-11 10:52:37 +00002531; SSSE3-NEXT: movdqa %xmm4, %xmm5
2532; SSSE3-NEXT: pxor %xmm9, %xmm5
2533; SSSE3-NEXT: movdqa %xmm11, %xmm6
2534; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
2535; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
2536; SSSE3-NEXT: pcmpeqd %xmm11, %xmm5
2537; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2538; SSSE3-NEXT: pand %xmm7, %xmm5
2539; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2540; SSSE3-NEXT: por %xmm5, %xmm6
2541; SSSE3-NEXT: pand %xmm6, %xmm4
2542; SSSE3-NEXT: pandn %xmm13, %xmm6
2543; SSSE3-NEXT: por %xmm4, %xmm6
2544; SSSE3-NEXT: movdqa %xmm6, %xmm7
Quentin Colombet48abac82018-02-17 03:05:33 +00002545; SSSE3-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
Simon Pilgrim0be55672018-02-11 10:52:37 +00002546; SSSE3-NEXT: movdqa %xmm3, %xmm4
2547; SSSE3-NEXT: pxor %xmm9, %xmm4
2548; SSSE3-NEXT: movdqa %xmm11, %xmm5
2549; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
2550; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
2551; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002552; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +00002553; SSSE3-NEXT: pand %xmm6, %xmm4
2554; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2555; SSSE3-NEXT: por %xmm4, %xmm5
2556; SSSE3-NEXT: pand %xmm5, %xmm3
2557; SSSE3-NEXT: pandn %xmm13, %xmm5
2558; SSSE3-NEXT: por %xmm3, %xmm5
2559; SSSE3-NEXT: movdqa %xmm5, %xmm8
Quentin Colombet48abac82018-02-17 03:05:33 +00002560; SSSE3-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
Simon Pilgrim0be55672018-02-11 10:52:37 +00002561; SSSE3-NEXT: movdqa %xmm2, %xmm3
2562; SSSE3-NEXT: pxor %xmm9, %xmm3
2563; SSSE3-NEXT: movdqa %xmm11, %xmm4
2564; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
2565; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
2566; SSSE3-NEXT: pcmpeqd %xmm11, %xmm3
2567; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2568; SSSE3-NEXT: pand %xmm5, %xmm3
2569; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
2570; SSSE3-NEXT: por %xmm3, %xmm15
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002571; SSSE3-NEXT: pand %xmm15, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00002572; SSSE3-NEXT: pandn %xmm13, %xmm15
2573; SSSE3-NEXT: movdqa %xmm13, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002574; SSSE3-NEXT: por %xmm2, %xmm15
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002575; SSSE3-NEXT: movdqa %xmm1, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00002576; SSSE3-NEXT: pxor %xmm9, %xmm2
2577; SSSE3-NEXT: movdqa %xmm11, %xmm3
2578; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
2579; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
2580; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002581; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002582; SSSE3-NEXT: pand %xmm4, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00002583; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
2584; SSSE3-NEXT: por %xmm2, %xmm13
2585; SSSE3-NEXT: pand %xmm13, %xmm1
2586; SSSE3-NEXT: pandn %xmm5, %xmm13
2587; SSSE3-NEXT: por %xmm1, %xmm13
2588; SSSE3-NEXT: movdqa %xmm0, %xmm1
2589; SSSE3-NEXT: pxor %xmm9, %xmm1
2590; SSSE3-NEXT: movdqa %xmm11, %xmm2
2591; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
2592; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
2593; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
2594; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2595; SSSE3-NEXT: pand %xmm4, %xmm1
2596; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
2597; SSSE3-NEXT: por %xmm1, %xmm11
2598; SSSE3-NEXT: pand %xmm11, %xmm0
2599; SSSE3-NEXT: pandn %xmm5, %xmm11
2600; SSSE3-NEXT: por %xmm0, %xmm11
2601; SSSE3-NEXT: movdqa %xmm11, %xmm0
2602; SSSE3-NEXT: pxor %xmm9, %xmm0
2603; SSSE3-NEXT: movdqa %xmm0, %xmm1
2604; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
2605; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
2606; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
2607; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2608; SSSE3-NEXT: pand %xmm4, %xmm0
2609; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
2610; SSSE3-NEXT: por %xmm0, %xmm2
2611; SSSE3-NEXT: movdqa %xmm13, %xmm0
2612; SSSE3-NEXT: pxor %xmm9, %xmm0
2613; SSSE3-NEXT: movdqa %xmm0, %xmm4
2614; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
2615; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
2616; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
2617; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2618; SSSE3-NEXT: pand %xmm5, %xmm0
2619; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
2620; SSSE3-NEXT: por %xmm0, %xmm12
2621; SSSE3-NEXT: movdqa %xmm15, %xmm0
2622; SSSE3-NEXT: pxor %xmm9, %xmm0
2623; SSSE3-NEXT: movdqa %xmm0, %xmm4
2624; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
2625; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
2626; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
2627; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2628; SSSE3-NEXT: pand %xmm5, %xmm0
2629; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
2630; SSSE3-NEXT: por %xmm0, %xmm6
2631; SSSE3-NEXT: movdqa %xmm8, %xmm0
2632; SSSE3-NEXT: pxor %xmm9, %xmm0
2633; SSSE3-NEXT: movdqa %xmm0, %xmm4
2634; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
2635; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
2636; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
2637; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2638; SSSE3-NEXT: pand %xmm5, %xmm0
2639; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
2640; SSSE3-NEXT: por %xmm0, %xmm8
2641; SSSE3-NEXT: movdqa %xmm7, %xmm0
2642; SSSE3-NEXT: pxor %xmm9, %xmm0
2643; SSSE3-NEXT: movdqa %xmm0, %xmm5
2644; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5
2645; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
2646; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2647; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
2648; SSSE3-NEXT: pand %xmm7, %xmm0
2649; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2650; SSSE3-NEXT: por %xmm0, %xmm5
2651; SSSE3-NEXT: movdqa %xmm10, %xmm0
2652; SSSE3-NEXT: pxor %xmm9, %xmm0
2653; SSSE3-NEXT: movdqa %xmm0, %xmm7
2654; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
2655; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
2656; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2657; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
2658; SSSE3-NEXT: pand %xmm10, %xmm0
2659; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
2660; SSSE3-NEXT: por %xmm0, %xmm10
2661; SSSE3-NEXT: movdqa %xmm14, %xmm0
2662; SSSE3-NEXT: pxor %xmm9, %xmm0
2663; SSSE3-NEXT: movdqa %xmm0, %xmm7
2664; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
2665; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
2666; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
2667; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
2668; SSSE3-NEXT: pand %xmm0, %xmm1
2669; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
2670; SSSE3-NEXT: por %xmm1, %xmm0
2671; SSSE3-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
2672; SSSE3-NEXT: movdqa %xmm4, %xmm1
2673; SSSE3-NEXT: pxor %xmm9, %xmm1
2674; SSSE3-NEXT: movdqa %xmm1, %xmm7
2675; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
2676; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
2677; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2678; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002679; SSSE3-NEXT: pand %xmm3, %xmm1
Simon Pilgrim0be55672018-02-11 10:52:37 +00002680; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
2681; SSSE3-NEXT: por %xmm1, %xmm3
2682; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255]
2683; SSSE3-NEXT: pand %xmm1, %xmm3
2684; SSSE3-NEXT: pand %xmm4, %xmm3
2685; SSSE3-NEXT: pand %xmm1, %xmm0
2686; SSSE3-NEXT: pand %xmm14, %xmm0
2687; SSSE3-NEXT: packuswb %xmm3, %xmm0
2688; SSSE3-NEXT: pand %xmm1, %xmm10
2689; SSSE3-NEXT: pand -{{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
2690; SSSE3-NEXT: pand %xmm1, %xmm5
2691; SSSE3-NEXT: pand -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
2692; SSSE3-NEXT: packuswb %xmm10, %xmm5
2693; SSSE3-NEXT: packuswb %xmm0, %xmm5
2694; SSSE3-NEXT: pand %xmm1, %xmm8
2695; SSSE3-NEXT: pand -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
2696; SSSE3-NEXT: pand %xmm1, %xmm6
2697; SSSE3-NEXT: pand %xmm15, %xmm6
2698; SSSE3-NEXT: packuswb %xmm8, %xmm6
2699; SSSE3-NEXT: pand %xmm1, %xmm12
2700; SSSE3-NEXT: pand %xmm13, %xmm12
2701; SSSE3-NEXT: pand %xmm1, %xmm2
2702; SSSE3-NEXT: pand %xmm11, %xmm2
2703; SSSE3-NEXT: packuswb %xmm12, %xmm2
2704; SSSE3-NEXT: packuswb %xmm6, %xmm2
2705; SSSE3-NEXT: packuswb %xmm5, %xmm2
2706; SSSE3-NEXT: movdqa %xmm2, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002707; SSSE3-NEXT: retq
2708;
2709; SSE41-LABEL: trunc_packus_v16i64_v16i8:
2710; SSE41: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00002711; SSE41-NEXT: movdqa %xmm0, %xmm8
2712; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255]
2713; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002714; SSE41-NEXT: movdqa %xmm7, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00002715; SSE41-NEXT: pxor %xmm10, %xmm0
2716; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483903,2147483903]
2717; SSE41-NEXT: movdqa %xmm12, %xmm11
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002718; SSE41-NEXT: pcmpgtd %xmm0, %xmm11
Simon Pilgrim0be55672018-02-11 10:52:37 +00002719; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
2720; SSE41-NEXT: pcmpeqd %xmm12, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002721; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
Simon Pilgrim0be55672018-02-11 10:52:37 +00002722; SSE41-NEXT: pand %xmm13, %xmm14
2723; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002724; SSE41-NEXT: por %xmm14, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00002725; SSE41-NEXT: movapd %xmm9, %xmm11
2726; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm11
2727; SSE41-NEXT: movdqa %xmm6, %xmm0
2728; SSE41-NEXT: pxor %xmm10, %xmm0
2729; SSE41-NEXT: movdqa %xmm12, %xmm7
2730; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
2731; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
2732; SSE41-NEXT: pcmpeqd %xmm12, %xmm0
2733; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
2734; SSE41-NEXT: pand %xmm13, %xmm14
2735; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
2736; SSE41-NEXT: por %xmm14, %xmm0
2737; SSE41-NEXT: movapd %xmm9, %xmm13
2738; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm13
2739; SSE41-NEXT: movdqa %xmm5, %xmm0
2740; SSE41-NEXT: pxor %xmm10, %xmm0
2741; SSE41-NEXT: movdqa %xmm12, %xmm6
2742; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
2743; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,2,2]
2744; SSE41-NEXT: pcmpeqd %xmm12, %xmm0
2745; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
2746; SSE41-NEXT: pand %xmm14, %xmm7
2747; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
2748; SSE41-NEXT: por %xmm7, %xmm0
2749; SSE41-NEXT: movapd %xmm9, %xmm14
2750; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm14
2751; SSE41-NEXT: movdqa %xmm4, %xmm0
2752; SSE41-NEXT: pxor %xmm10, %xmm0
2753; SSE41-NEXT: movdqa %xmm12, %xmm5
2754; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
2755; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
2756; SSE41-NEXT: pcmpeqd %xmm12, %xmm0
2757; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
2758; SSE41-NEXT: pand %xmm6, %xmm7
2759; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
2760; SSE41-NEXT: por %xmm7, %xmm0
2761; SSE41-NEXT: movapd %xmm9, %xmm15
2762; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm15
2763; SSE41-NEXT: movdqa %xmm3, %xmm0
2764; SSE41-NEXT: pxor %xmm10, %xmm0
2765; SSE41-NEXT: movdqa %xmm12, %xmm4
2766; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
2767; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
2768; SSE41-NEXT: pcmpeqd %xmm12, %xmm0
2769; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
2770; SSE41-NEXT: pand %xmm6, %xmm7
2771; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
2772; SSE41-NEXT: por %xmm7, %xmm0
2773; SSE41-NEXT: movapd %xmm9, %xmm4
2774; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
2775; SSE41-NEXT: movdqa %xmm2, %xmm0
2776; SSE41-NEXT: pxor %xmm10, %xmm0
2777; SSE41-NEXT: movdqa %xmm12, %xmm3
2778; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
2779; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
2780; SSE41-NEXT: pcmpeqd %xmm12, %xmm0
2781; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
2782; SSE41-NEXT: pand %xmm6, %xmm7
2783; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
2784; SSE41-NEXT: por %xmm7, %xmm0
2785; SSE41-NEXT: movapd %xmm9, %xmm6
2786; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002787; SSE41-NEXT: movdqa %xmm1, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00002788; SSE41-NEXT: pxor %xmm10, %xmm0
2789; SSE41-NEXT: movdqa %xmm12, %xmm2
2790; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
2791; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
2792; SSE41-NEXT: pcmpeqd %xmm12, %xmm0
2793; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
2794; SSE41-NEXT: pand %xmm3, %xmm7
2795; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
2796; SSE41-NEXT: por %xmm7, %xmm0
2797; SSE41-NEXT: movapd %xmm9, %xmm7
2798; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002799; SSE41-NEXT: movdqa %xmm8, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00002800; SSE41-NEXT: pxor %xmm10, %xmm0
2801; SSE41-NEXT: movdqa %xmm12, %xmm1
2802; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
2803; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
2804; SSE41-NEXT: pcmpeqd %xmm12, %xmm0
2805; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2806; SSE41-NEXT: pand %xmm2, %xmm3
2807; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
2808; SSE41-NEXT: por %xmm3, %xmm0
2809; SSE41-NEXT: movapd %xmm9, %xmm2
2810; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002811; SSE41-NEXT: movapd %xmm2, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00002812; SSE41-NEXT: xorpd %xmm10, %xmm0
2813; SSE41-NEXT: movapd %xmm0, %xmm1
2814; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
2815; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
2816; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2817; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2818; SSE41-NEXT: pand %xmm3, %xmm5
2819; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
2820; SSE41-NEXT: por %xmm5, %xmm0
2821; SSE41-NEXT: xorpd %xmm8, %xmm8
2822; SSE41-NEXT: pxor %xmm1, %xmm1
2823; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
2824; SSE41-NEXT: movapd %xmm7, %xmm0
2825; SSE41-NEXT: xorpd %xmm10, %xmm0
2826; SSE41-NEXT: movapd %xmm0, %xmm2
2827; SSE41-NEXT: pcmpgtd %xmm10, %xmm2
2828; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
2829; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2830; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2831; SSE41-NEXT: pand %xmm5, %xmm3
2832; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
2833; SSE41-NEXT: por %xmm3, %xmm0
2834; SSE41-NEXT: pxor %xmm12, %xmm12
2835; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm12
2836; SSE41-NEXT: movapd %xmm6, %xmm0
2837; SSE41-NEXT: xorpd %xmm10, %xmm0
2838; SSE41-NEXT: movapd %xmm0, %xmm3
2839; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
2840; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2841; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2842; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
2843; SSE41-NEXT: pand %xmm5, %xmm7
2844; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
2845; SSE41-NEXT: por %xmm7, %xmm0
2846; SSE41-NEXT: pxor %xmm7, %xmm7
2847; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
2848; SSE41-NEXT: movapd %xmm4, %xmm0
2849; SSE41-NEXT: xorpd %xmm10, %xmm0
2850; SSE41-NEXT: movapd %xmm0, %xmm3
2851; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
2852; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2853; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2854; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
2855; SSE41-NEXT: pand %xmm5, %xmm6
2856; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
2857; SSE41-NEXT: por %xmm6, %xmm0
2858; SSE41-NEXT: pxor %xmm6, %xmm6
2859; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002860; SSE41-NEXT: movapd %xmm15, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00002861; SSE41-NEXT: xorpd %xmm10, %xmm0
2862; SSE41-NEXT: movapd %xmm0, %xmm3
2863; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
2864; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
2865; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2866; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2867; SSE41-NEXT: pand %xmm4, %xmm5
2868; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
2869; SSE41-NEXT: por %xmm5, %xmm0
2870; SSE41-NEXT: pxor %xmm4, %xmm4
2871; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4
2872; SSE41-NEXT: movapd %xmm14, %xmm0
2873; SSE41-NEXT: xorpd %xmm10, %xmm0
2874; SSE41-NEXT: movapd %xmm0, %xmm3
2875; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
2876; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2877; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2878; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
2879; SSE41-NEXT: pand %xmm5, %xmm2
2880; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
2881; SSE41-NEXT: por %xmm2, %xmm0
2882; SSE41-NEXT: xorpd %xmm15, %xmm15
2883; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm15
2884; SSE41-NEXT: movapd %xmm13, %xmm0
2885; SSE41-NEXT: xorpd %xmm10, %xmm0
2886; SSE41-NEXT: movapd %xmm0, %xmm2
2887; SSE41-NEXT: pcmpgtd %xmm10, %xmm2
2888; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,2,2]
2889; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2890; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2891; SSE41-NEXT: pand %xmm14, %xmm3
2892; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
2893; SSE41-NEXT: por %xmm3, %xmm0
2894; SSE41-NEXT: pxor %xmm2, %xmm2
2895; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002896; SSE41-NEXT: movapd %xmm11, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00002897; SSE41-NEXT: xorpd %xmm10, %xmm0
2898; SSE41-NEXT: movapd %xmm0, %xmm3
2899; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
2900; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,0,2,2]
2901; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
2902; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2903; SSE41-NEXT: pand %xmm13, %xmm5
2904; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
2905; SSE41-NEXT: por %xmm5, %xmm0
2906; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm8
2907; SSE41-NEXT: andpd %xmm9, %xmm8
2908; SSE41-NEXT: andpd %xmm9, %xmm2
2909; SSE41-NEXT: packuswb %xmm8, %xmm2
2910; SSE41-NEXT: andpd %xmm9, %xmm15
2911; SSE41-NEXT: andpd %xmm9, %xmm4
2912; SSE41-NEXT: packuswb %xmm15, %xmm4
2913; SSE41-NEXT: packuswb %xmm2, %xmm4
2914; SSE41-NEXT: andpd %xmm9, %xmm6
2915; SSE41-NEXT: andpd %xmm9, %xmm7
2916; SSE41-NEXT: packuswb %xmm6, %xmm7
2917; SSE41-NEXT: andpd %xmm9, %xmm12
2918; SSE41-NEXT: andpd %xmm9, %xmm1
2919; SSE41-NEXT: packuswb %xmm12, %xmm1
2920; SSE41-NEXT: packuswb %xmm7, %xmm1
2921; SSE41-NEXT: packuswb %xmm4, %xmm1
2922; SSE41-NEXT: movdqa %xmm1, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002923; SSE41-NEXT: retq
2924;
2925; AVX1-LABEL: trunc_packus_v16i64_v16i8:
2926; AVX1: # %bb.0:
Simon Pilgrim0be55672018-02-11 10:52:37 +00002927; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [255,255,255,255]
2928; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002929; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
Simon Pilgrim0be55672018-02-11 10:52:37 +00002930; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6
2931; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm7
2932; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
2933; AVX1-NEXT: vblendvpd %ymm6, %ymm3, %ymm5, %ymm3
2934; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
2935; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6
2936; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7
2937; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
2938; AVX1-NEXT: vblendvpd %ymm6, %ymm2, %ymm5, %ymm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002939; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
2940; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6
2941; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm7
2942; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
Simon Pilgrim0be55672018-02-11 10:52:37 +00002943; AVX1-NEXT: vblendvpd %ymm6, %ymm1, %ymm5, %ymm1
2944; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
2945; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6
2946; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm7
2947; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
2948; AVX1-NEXT: vblendvpd %ymm6, %ymm0, %ymm5, %ymm0
2949; AVX1-NEXT: vxorpd %xmm5, %xmm5, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002950; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
Simon Pilgrim0be55672018-02-11 10:52:37 +00002951; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm7
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002952; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
2953; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm6
Simon Pilgrim0be55672018-02-11 10:52:37 +00002954; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm7
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002955; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2956; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm3
2957; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
2958; AVX1-NEXT: vpackuswb %xmm6, %xmm3, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00002959; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002960; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
2961; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
Simon Pilgrim0be55672018-02-11 10:52:37 +00002962; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002963; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2964; AVX1-NEXT: vpand %xmm2, %xmm7, %xmm2
2965; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00002966; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002967; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00002968; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002969; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
2970; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00002971; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm6
2972; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2973; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002974; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
Simon Pilgrim0be55672018-02-11 10:52:37 +00002975; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
2976; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002977; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
Simon Pilgrim0be55672018-02-11 10:52:37 +00002978; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
2979; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002980; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00002981; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002982; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00002983; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002984; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2985; AVX1-NEXT: vzeroupper
2986; AVX1-NEXT: retq
2987;
2988; AVX2-SLOW-LABEL: trunc_packus_v16i64_v16i8:
2989; AVX2-SLOW: # %bb.0:
2990; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
2991; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002992; AVX2-SLOW-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00002993; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5
2994; AVX2-SLOW-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
2995; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5
2996; AVX2-SLOW-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
2997; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
2998; AVX2-SLOW-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00002999; AVX2-SLOW-NEXT: vpxor %xmm4, %xmm4, %xmm4
Simon Pilgrim0be55672018-02-11 10:52:37 +00003000; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
3001; AVX2-SLOW-NEXT: vpand %ymm1, %ymm5, %ymm1
3002; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
3003; AVX2-SLOW-NEXT: vpand %ymm0, %ymm5, %ymm0
3004; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm5
3005; AVX2-SLOW-NEXT: vpand %ymm3, %ymm5, %ymm3
3006; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm4
3007; AVX2-SLOW-NEXT: vpand %ymm2, %ymm4, %ymm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003008; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
3009; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3010; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
3011; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
3012; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3013; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3014; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3015; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3016; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3017; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
3018; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3019; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3020; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3021; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3022; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3023; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3024; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3025; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
3026; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3027; AVX2-SLOW-NEXT: vzeroupper
3028; AVX2-SLOW-NEXT: retq
3029;
3030; AVX2-FAST-LABEL: trunc_packus_v16i64_v16i8:
3031; AVX2-FAST: # %bb.0:
3032; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3033; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003034; AVX2-FAST-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
Simon Pilgrim0be55672018-02-11 10:52:37 +00003035; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5
3036; AVX2-FAST-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
3037; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5
3038; AVX2-FAST-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
3039; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
3040; AVX2-FAST-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003041; AVX2-FAST-NEXT: vpxor %xmm4, %xmm4, %xmm4
Simon Pilgrim0be55672018-02-11 10:52:37 +00003042; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
3043; AVX2-FAST-NEXT: vpand %ymm1, %ymm5, %ymm1
3044; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
3045; AVX2-FAST-NEXT: vpand %ymm0, %ymm5, %ymm0
3046; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm5
3047; AVX2-FAST-NEXT: vpand %ymm3, %ymm5, %ymm3
3048; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm4
3049; AVX2-FAST-NEXT: vpand %ymm2, %ymm4, %ymm2
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003050; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3051; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
3052; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
3053; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3054; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3055; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3056; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3057; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3058; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
3059; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
3060; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
3061; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3062; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3063; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3064; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
3065; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3066; AVX2-FAST-NEXT: vzeroupper
3067; AVX2-FAST-NEXT: retq
3068;
3069; AVX512-LABEL: trunc_packus_v16i64_v16i8:
3070; AVX512: # %bb.0:
3071; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
3072; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0
3073; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1
3074; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
3075; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
3076; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
3077; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3078; AVX512-NEXT: vpmovqd %zmm1, %ymm1
3079; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3080; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3081; AVX512-NEXT: vzeroupper
3082; AVX512-NEXT: retq
3083 %1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
3084 %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
3085 %3 = icmp sgt <16 x i64> %2, zeroinitializer
3086 %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer
3087 %5 = trunc <16 x i64> %4 to <16 x i8>
3088 ret <16 x i8> %5
3089}
3090
; Unsigned-saturating truncation of <8 x i32> to <8 x i8>.
; The IR at the end of this function clamps every lane of %a0 to the signed
; range [0, 255] (icmp slt + select against 255, then icmp sgt + select against
; zero) and truncates the clamped value to i8.
; The expectation lines between the define and the IR body are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file), one
; group per FileCheck prefix declared on the RUN lines. Regenerate them with
; that script instead of editing them by hand.
; NOTE(review): the expected sequences end in a dword-to-word narrowing
; (packssdw, vpmovdw, or a pshufb selecting bytes 0,1,4,5,...), presumably
; because <8 x i8> is promoted to <8 x i16> during type legalization - confirm.
3091define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
3092; SSE2-LABEL: trunc_packus_v8i32_v8i8:
3093; SSE2: # %bb.0:
3094; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
3095; SSE2-NEXT: movdqa %xmm2, %xmm3
3096; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003097; SSE2-NEXT: pand %xmm3, %xmm1
3098; SSE2-NEXT: pandn %xmm2, %xmm3
3099; SSE2-NEXT: por %xmm1, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00003100; SSE2-NEXT: movdqa %xmm2, %xmm1
3101; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
3102; SSE2-NEXT: pand %xmm1, %xmm0
3103; SSE2-NEXT: pandn %xmm2, %xmm1
3104; SSE2-NEXT: por %xmm0, %xmm1
3105; SSE2-NEXT: pxor %xmm2, %xmm2
3106; SSE2-NEXT: movdqa %xmm1, %xmm0
3107; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
3108; SSE2-NEXT: pand %xmm1, %xmm0
3109; SSE2-NEXT: movdqa %xmm3, %xmm1
3110; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
3111; SSE2-NEXT: pand %xmm3, %xmm1
3112; SSE2-NEXT: pslld $16, %xmm1
3113; SSE2-NEXT: psrad $16, %xmm1
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003114; SSE2-NEXT: pslld $16, %xmm0
3115; SSE2-NEXT: psrad $16, %xmm0
Simon Pilgrim0be55672018-02-11 10:52:37 +00003116; SSE2-NEXT: packssdw %xmm1, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003117; SSE2-NEXT: retq
3118;
3119; SSSE3-LABEL: trunc_packus_v8i32_v8i8:
3120; SSSE3: # %bb.0:
3121; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
3122; SSSE3-NEXT: movdqa %xmm2, %xmm3
3123; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003124; SSSE3-NEXT: pand %xmm3, %xmm1
3125; SSSE3-NEXT: pandn %xmm2, %xmm3
3126; SSSE3-NEXT: por %xmm1, %xmm3
Simon Pilgrim0be55672018-02-11 10:52:37 +00003127; SSSE3-NEXT: movdqa %xmm2, %xmm1
3128; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
3129; SSSE3-NEXT: pand %xmm1, %xmm0
3130; SSSE3-NEXT: pandn %xmm2, %xmm1
3131; SSSE3-NEXT: por %xmm0, %xmm1
3132; SSSE3-NEXT: pxor %xmm2, %xmm2
3133; SSSE3-NEXT: movdqa %xmm1, %xmm0
3134; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
3135; SSSE3-NEXT: pand %xmm1, %xmm0
3136; SSSE3-NEXT: movdqa %xmm3, %xmm1
3137; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
3138; SSSE3-NEXT: pand %xmm3, %xmm1
3139; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3140; SSSE3-NEXT: pshufb %xmm2, %xmm1
3141; SSSE3-NEXT: pshufb %xmm2, %xmm0
3142; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003143; SSSE3-NEXT: retq
3144;
3145; SSE41-LABEL: trunc_packus_v8i32_v8i8:
3146; SSE41: # %bb.0:
3147; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
3148; SSE41-NEXT: pminsd %xmm2, %xmm1
3149; SSE41-NEXT: pminsd %xmm2, %xmm0
3150; SSE41-NEXT: pxor %xmm2, %xmm2
3151; SSE41-NEXT: pmaxsd %xmm2, %xmm0
3152; SSE41-NEXT: pmaxsd %xmm2, %xmm1
3153; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3154; SSE41-NEXT: pshufb %xmm2, %xmm1
3155; SSE41-NEXT: pshufb %xmm2, %xmm0
3156; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3157; SSE41-NEXT: retq
3158;
3159; AVX1-LABEL: trunc_packus_v8i32_v8i8:
3160; AVX1: # %bb.0:
3161; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3162; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
3163; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
3164; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
3165; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
3166; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
3167; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
3168; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3169; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3170; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3171; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3172; AVX1-NEXT: vzeroupper
3173; AVX1-NEXT: retq
3174;
3175; AVX2-LABEL: trunc_packus_v8i32_v8i8:
3176; AVX2: # %bb.0:
3177; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
3178; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
3179; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
3180; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3181; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3182; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
Puyan Lotfi43e94b12018-01-31 22:04:26 +00003183; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003184; AVX2-NEXT: vzeroupper
3185; AVX2-NEXT: retq
3186;
3187; AVX512F-LABEL: trunc_packus_v8i32_v8i8:
3188; AVX512F: # %bb.0:
3189; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
3190; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
3191; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
3192; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3193; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
Puyan Lotfi43e94b12018-01-31 22:04:26 +00003194; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003195; AVX512F-NEXT: vzeroupper
3196; AVX512F-NEXT: retq
3197;
3198; AVX512VL-LABEL: trunc_packus_v8i32_v8i8:
3199; AVX512VL: # %bb.0:
3200; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
3201; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3202; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3203; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
3204; AVX512VL-NEXT: vzeroupper
3205; AVX512VL-NEXT: retq
3206;
3207; AVX512BW-LABEL: trunc_packus_v8i32_v8i8:
3208; AVX512BW: # %bb.0:
3209; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
3210; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
3211; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
3212; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3213; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
Puyan Lotfi43e94b12018-01-31 22:04:26 +00003214; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003215; AVX512BW-NEXT: vzeroupper
3216; AVX512BW-NEXT: retq
3217;
3218; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8:
3219; AVX512BWVL: # %bb.0:
3220; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
3221; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3222; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3223; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
3224; AVX512BWVL-NEXT: vzeroupper
3225; AVX512BWVL-NEXT: retq
; IR under test. %1 and %2 compute the signed minimum of %a0 and 255, %3 and %4
; compute the signed maximum of %2 and zero, and %5 narrows each lane to i8.
3226 %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
3227 %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
3228 %3 = icmp sgt <8 x i32> %2, zeroinitializer
3229 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
3230 %5 = trunc <8 x i32> %4 to <8 x i8>
3231 ret <8 x i8> %5
3232}
3233
Simon Pilgrim689d8132018-02-15 17:48:34 +00003234define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
3235; SSE2-LABEL: trunc_packus_v8i32_v8i8_store:
3236; SSE2: # %bb.0:
3237; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
3238; SSE2-NEXT: movdqa %xmm2, %xmm3
3239; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
3240; SSE2-NEXT: pand %xmm3, %xmm1
3241; SSE2-NEXT: pandn %xmm2, %xmm3
3242; SSE2-NEXT: por %xmm1, %xmm3
3243; SSE2-NEXT: movdqa %xmm2, %xmm1
3244; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
3245; SSE2-NEXT: pand %xmm1, %xmm0
3246; SSE2-NEXT: pandn %xmm2, %xmm1
3247; SSE2-NEXT: por %xmm0, %xmm1
3248; SSE2-NEXT: pxor %xmm0, %xmm0
3249; SSE2-NEXT: movdqa %xmm1, %xmm4
3250; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
3251; SSE2-NEXT: movdqa %xmm3, %xmm5
3252; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
3253; SSE2-NEXT: pand %xmm2, %xmm5
3254; SSE2-NEXT: pand %xmm3, %xmm5
3255; SSE2-NEXT: pand %xmm2, %xmm4
3256; SSE2-NEXT: pand %xmm1, %xmm4
3257; SSE2-NEXT: packuswb %xmm5, %xmm4
3258; SSE2-NEXT: packuswb %xmm4, %xmm4
3259; SSE2-NEXT: movq %xmm4, (%rdi)
3260; SSE2-NEXT: retq
3261;
3262; SSSE3-LABEL: trunc_packus_v8i32_v8i8_store:
3263; SSSE3: # %bb.0:
3264; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
3265; SSSE3-NEXT: movdqa %xmm2, %xmm3
3266; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
3267; SSSE3-NEXT: pand %xmm3, %xmm1
3268; SSSE3-NEXT: pandn %xmm2, %xmm3
3269; SSSE3-NEXT: por %xmm1, %xmm3
3270; SSSE3-NEXT: movdqa %xmm2, %xmm1
3271; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
3272; SSSE3-NEXT: pand %xmm1, %xmm0
3273; SSSE3-NEXT: pandn %xmm2, %xmm1
3274; SSSE3-NEXT: por %xmm0, %xmm1
3275; SSSE3-NEXT: pxor %xmm0, %xmm0
3276; SSSE3-NEXT: movdqa %xmm1, %xmm2
3277; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
3278; SSSE3-NEXT: pand %xmm1, %xmm2
3279; SSSE3-NEXT: movdqa %xmm3, %xmm1
3280; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
3281; SSSE3-NEXT: pand %xmm3, %xmm1
3282; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3283; SSSE3-NEXT: pshufb %xmm0, %xmm1
3284; SSSE3-NEXT: pshufb %xmm0, %xmm2
3285; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
3286; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
3287; SSSE3-NEXT: movq %xmm2, (%rdi)
3288; SSSE3-NEXT: retq
3289;
3290; SSE41-LABEL: trunc_packus_v8i32_v8i8_store:
3291; SSE41: # %bb.0:
3292; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
3293; SSE41-NEXT: pminsd %xmm2, %xmm1
3294; SSE41-NEXT: pminsd %xmm2, %xmm0
3295; SSE41-NEXT: pxor %xmm2, %xmm2
3296; SSE41-NEXT: pmaxsd %xmm2, %xmm0
3297; SSE41-NEXT: pmaxsd %xmm2, %xmm1
3298; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3299; SSE41-NEXT: pshufb %xmm2, %xmm1
3300; SSE41-NEXT: pshufb %xmm2, %xmm0
3301; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3302; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
3303; SSE41-NEXT: movq %xmm0, (%rdi)
3304; SSE41-NEXT: retq
3305;
3306; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
3307; AVX1: # %bb.0:
3308; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3309; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
3310; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
3311; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
3312; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
3313; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
3314; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
3315; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
3316; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3317; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3318; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3319; AVX1-NEXT: vmovq %xmm0, (%rdi)
3320; AVX1-NEXT: vzeroupper
3321; AVX1-NEXT: retq
3322;
3323; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
3324; AVX2: # %bb.0:
3325; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
3326; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
3327; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
3328; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3329; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3330; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3331; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
3332; AVX2-NEXT: vmovq %xmm0, (%rdi)
3333; AVX2-NEXT: vzeroupper
3334; AVX2-NEXT: retq
3335;
3336; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store:
3337; AVX512F: # %bb.0:
3338; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
3339; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
3340; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
3341; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3342; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
3343; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
3344; AVX512F-NEXT: vmovq %xmm0, (%rdi)
3345; AVX512F-NEXT: vzeroupper
3346; AVX512F-NEXT: retq
3347;
3348; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store:
3349; AVX512VL: # %bb.0:
3350; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
3351; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3352; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3353; AVX512VL-NEXT: vpmovdb %ymm0, (%rdi)
3354; AVX512VL-NEXT: vzeroupper
3355; AVX512VL-NEXT: retq
3356;
3357; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store:
3358; AVX512BW: # %bb.0:
3359; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
3360; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
3361; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
3362; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3363; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
3364; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
3365; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
3366; AVX512BW-NEXT: vzeroupper
3367; AVX512BW-NEXT: retq
3368;
3369; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store:
3370; AVX512BWVL: # %bb.0:
3371; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
3372; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3373; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
3374; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi)
3375; AVX512BWVL-NEXT: vzeroupper
3376; AVX512BWVL-NEXT: retq
3377 %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
3378 %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
3379 %3 = icmp sgt <8 x i32> %2, zeroinitializer
3380 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
3381 %5 = trunc <8 x i32> %4 to <8 x i8>
3382 store <8 x i8> %5, <8 x i8> *%p1
3383 ret void
3384}
3385
; Test: clamp every i32 lane of %a0 to [0, 255] and truncate to <16 x i8>.
; The IR below is a signed-min against 255 followed by a signed-max against 0,
; each written as an icmp+select pair, and then a trunc to i8.
; Expected lowering, per the autogenerated assertions that follow:
;  - the SSE, AVX1 and AVX2 prefixes expect no explicit min/max at all, only a
;    packssdw step followed by a packuswb step;
;  - the AVX512 prefix expects an explicit vpminsd/vpmaxsd clamp followed by
;    vpmovdb.
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003386define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32> %a0) {
Simon Pilgrim17bb6f02018-02-15 14:37:59 +00003387; SSE-LABEL: trunc_packus_v16i32_v16i8:
3388; SSE: # %bb.0:
3389; SSE-NEXT: packssdw %xmm3, %xmm2
3390; SSE-NEXT: packssdw %xmm1, %xmm0
3391; SSE-NEXT: packuswb %xmm2, %xmm0
3392; SSE-NEXT: retq
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003393;
3394; AVX1-LABEL: trunc_packus_v16i32_v16i8:
3395; AVX1: # %bb.0:
3396; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
Simon Pilgrim17bb6f02018-02-15 14:37:59 +00003397; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
3398; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3399; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003400; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3401; AVX1-NEXT: vzeroupper
3402; AVX1-NEXT: retq
3403;
3404; AVX2-LABEL: trunc_packus_v16i32_v16i8:
3405; AVX2: # %bb.0:
Simon Pilgrim17bb6f02018-02-15 14:37:59 +00003406; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
3407; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3408; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3409; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003410; AVX2-NEXT: vzeroupper
3411; AVX2-NEXT: retq
3412;
3413; AVX512-LABEL: trunc_packus_v16i32_v16i8:
3414; AVX512: # %bb.0:
3415; AVX512-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
3416; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
3417; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
3418; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3419; AVX512-NEXT: vzeroupper
3420; AVX512-NEXT: retq
; Upper clamp: keep %a0 where it is signed-less-than 255, otherwise use 255.
3421 %1 = icmp slt <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
3422 %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
; Lower clamp: keep the value where it is signed-greater-than 0, otherwise use 0.
3423 %3 = icmp sgt <16 x i32> %2, zeroinitializer
3424 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
; Narrow the clamped lanes to i8 and return them.
3425 %5 = trunc <16 x i32> %4 to <16 x i8>
3426 ret <16 x i8> %5
3427}
3428
; Test: clamp every i16 lane of %a0 to [0, 255] and truncate to <16 x i8>.
; The IR below is a signed-min against 255 followed by a signed-max against 0,
; each written as an icmp+select pair, and then a trunc to i8.
; Expected lowering, per the autogenerated assertions that follow:
;  - the SSE prefix expects a single packuswb;
;  - the AVX1 and AVX2 prefixes expect the upper 128 bits to be extracted and
;    then a single vpackuswb;
;  - the AVX512F and AVX512VL prefixes expect an explicit vpminsw/vpmaxsw clamp,
;    then vpmovsxwd followed by vpmovdb;
;  - the AVX512BW and AVX512BWVL prefixes expect an explicit vpminsw/vpmaxsw
;    clamp followed by vpmovwb.
3429define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
Simon Pilgrim86d15bf2018-02-14 14:14:29 +00003430; SSE-LABEL: trunc_packus_v16i16_v16i8:
3431; SSE: # %bb.0:
3432; SSE-NEXT: packuswb %xmm1, %xmm0
3433; SSE-NEXT: retq
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003434;
3435; AVX1-LABEL: trunc_packus_v16i16_v16i8:
3436; AVX1: # %bb.0:
3437; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
Simon Pilgrimae00a712018-02-06 14:07:46 +00003438; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003439; AVX1-NEXT: vzeroupper
3440; AVX1-NEXT: retq
3441;
3442; AVX2-LABEL: trunc_packus_v16i16_v16i8:
3443; AVX2: # %bb.0:
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003444; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimae00a712018-02-06 14:07:46 +00003445; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003446; AVX2-NEXT: vzeroupper
3447; AVX2-NEXT: retq
3448;
3449; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
3450; AVX512F: # %bb.0:
3451; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
3452; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
3453; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
3454; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
3455; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3456; AVX512F-NEXT: vzeroupper
3457; AVX512F-NEXT: retq
3458;
3459; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
3460; AVX512VL: # %bb.0:
3461; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
3462; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3463; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
3464; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
3465; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
3466; AVX512VL-NEXT: vzeroupper
3467; AVX512VL-NEXT: retq
3468;
3469; AVX512BW-LABEL: trunc_packus_v16i16_v16i8:
3470; AVX512BW: # %bb.0:
3471; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
3472; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
3473; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
3474; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
Puyan Lotfi43e94b12018-01-31 22:04:26 +00003475; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003476; AVX512BW-NEXT: vzeroupper
3477; AVX512BW-NEXT: retq
3478;
3479; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
3480; AVX512BWVL: # %bb.0:
3481; AVX512BWVL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
3482; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3483; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
3484; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
3485; AVX512BWVL-NEXT: vzeroupper
3486; AVX512BWVL-NEXT: retq
; Upper clamp: keep %a0 where it is signed-less-than 255, otherwise use 255.
3487 %1 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
3488 %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; Lower clamp: keep the value where it is signed-greater-than 0, otherwise use 0.
3489 %3 = icmp sgt <16 x i16> %2, zeroinitializer
3490 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
; Narrow the clamped lanes to i8 and return them.
3491 %5 = trunc <16 x i16> %4 to <16 x i8>
3492 ret <16 x i8> %5
3493}
3494
; Test: clamp every i16 lane of %a0 to [0, 255] and truncate to <32 x i8>.
; The IR below is a signed-min against 255 followed by a signed-max against 0,
; each written as an icmp+select pair, and then a trunc to i8.
; Expected lowering, per the autogenerated assertions that follow:
;  - the SSE prefix expects two packuswb instructions plus a register move;
;  - the AVX1 prefix expects one vpackuswb per 256-bit input (after extracting
;    its upper half) and a vinsertf128 to join the two results;
;  - the AVX2 prefix expects one 256-bit vpackuswb followed by a vpermq;
;  - the AVX512F and AVX512VL prefixes expect an explicit vpminsw/vpmaxsw clamp
;    on each ymm half, then vpmovsxwd + vpmovdb per half and a vinserti128;
;  - the AVX512BW and AVX512BWVL prefixes expect a zmm vpminsw/vpmaxsw clamp
;    followed by vpmovwb.
3495define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) {
Simon Pilgrim86d15bf2018-02-14 14:14:29 +00003496; SSE-LABEL: trunc_packus_v32i16_v32i8:
3497; SSE: # %bb.0:
3498; SSE-NEXT: packuswb %xmm1, %xmm0
3499; SSE-NEXT: packuswb %xmm3, %xmm2
3500; SSE-NEXT: movdqa %xmm2, %xmm1
3501; SSE-NEXT: retq
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003502;
3503; AVX1-LABEL: trunc_packus_v32i16_v32i8:
3504; AVX1: # %bb.0:
3505; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
Simon Pilgrimae00a712018-02-06 14:07:46 +00003506; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
3507; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3508; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003509; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
3510; AVX1-NEXT: retq
3511;
3512; AVX2-LABEL: trunc_packus_v32i16_v32i8:
3513; AVX2: # %bb.0:
Simon Pilgrim86d15bf2018-02-14 14:14:29 +00003514; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3515; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
Simon Pilgrim65ec9232018-01-26 14:58:50 +00003516; AVX2-NEXT: retq
3517;
3518; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
3519; AVX512F: # %bb.0:
3520; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3521; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0
3522; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1
3523; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
3524; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
3525; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
3526; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
3527; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3528; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
3529; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
3530; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3531; AVX512F-NEXT: retq
3532;
3533; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
3534; AVX512VL: # %bb.0:
3535; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3536; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0
3537; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1
3538; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
3539; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
3540; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
3541; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
3542; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
3543; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
3544; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
3545; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3546; AVX512VL-NEXT: retq
3547;
3548; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
3549; AVX512BW: # %bb.0:
3550; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %zmm0, %zmm0
3551; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
3552; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
3553; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3554; AVX512BW-NEXT: retq
3555;
3556; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
3557; AVX512BWVL: # %bb.0:
3558; AVX512BWVL-NEXT: vpminsw {{.*}}(%rip), %zmm0, %zmm0
3559; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3560; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
3561; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
3562; AVX512BWVL-NEXT: retq
; Upper clamp: keep %a0 where it is signed-less-than 255, otherwise use 255.
3563 %1 = icmp slt <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
3564 %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; Lower clamp: keep the value where it is signed-greater-than 0, otherwise use 0.
3565 %3 = icmp sgt <32 x i16> %2, zeroinitializer
3566 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
; Narrow the clamped lanes to i8 and return them.
3567 %5 = trunc <32 x i16> %4 to <32 x i8>
3568 ret <32 x i8> %5
3569}