; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

;
; PACKUS saturation truncation to vXi32
;

17define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
18; SSE2-LABEL: trunc_packus_v4i64_v4i32:
19; SSE2: # %bb.0:
20; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
21; SSE2-NEXT: movdqa %xmm0, %xmm3
22; SSE2-NEXT: pxor %xmm2, %xmm3
23; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647]
24; SSE2-NEXT: movdqa %xmm4, %xmm5
25; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
26; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
27; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
28; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
29; SSE2-NEXT: pand %xmm6, %xmm7
30; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
31; SSE2-NEXT: por %xmm7, %xmm3
32; SSE2-NEXT: movdqa %xmm1, %xmm5
33; SSE2-NEXT: pxor %xmm2, %xmm5
34; SSE2-NEXT: movdqa %xmm4, %xmm6
35; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
36; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
37; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
38; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
39; SSE2-NEXT: pand %xmm7, %xmm5
40; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
41; SSE2-NEXT: por %xmm5, %xmm4
42; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
43; SSE2-NEXT: pand %xmm4, %xmm1
44; SSE2-NEXT: pandn %xmm5, %xmm4
45; SSE2-NEXT: por %xmm1, %xmm4
46; SSE2-NEXT: pand %xmm3, %xmm0
47; SSE2-NEXT: pandn %xmm5, %xmm3
48; SSE2-NEXT: por %xmm0, %xmm3
49; SSE2-NEXT: movdqa %xmm3, %xmm0
50; SSE2-NEXT: pxor %xmm2, %xmm0
51; SSE2-NEXT: movdqa %xmm0, %xmm1
52; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
53; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
54; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
55; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
56; SSE2-NEXT: pand %xmm5, %xmm6
57; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
58; SSE2-NEXT: por %xmm6, %xmm0
59; SSE2-NEXT: movdqa %xmm4, %xmm1
60; SSE2-NEXT: pxor %xmm2, %xmm1
61; SSE2-NEXT: movdqa %xmm1, %xmm5
62; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
63; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
64; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
65; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
66; SSE2-NEXT: pand %xmm6, %xmm1
67; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
68; SSE2-NEXT: por %xmm1, %xmm2
69; SSE2-NEXT: pand %xmm4, %xmm2
70; SSE2-NEXT: pand %xmm3, %xmm0
71; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
72; SSE2-NEXT: retq
73;
74; SSSE3-LABEL: trunc_packus_v4i64_v4i32:
75; SSSE3: # %bb.0:
76; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
77; SSSE3-NEXT: movdqa %xmm0, %xmm3
78; SSSE3-NEXT: pxor %xmm2, %xmm3
79; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647]
80; SSSE3-NEXT: movdqa %xmm4, %xmm5
81; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
82; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
83; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
84; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
85; SSSE3-NEXT: pand %xmm6, %xmm7
86; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
87; SSSE3-NEXT: por %xmm7, %xmm3
88; SSSE3-NEXT: movdqa %xmm1, %xmm5
89; SSSE3-NEXT: pxor %xmm2, %xmm5
90; SSSE3-NEXT: movdqa %xmm4, %xmm6
91; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
92; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
93; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
94; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
95; SSSE3-NEXT: pand %xmm7, %xmm5
96; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
97; SSSE3-NEXT: por %xmm5, %xmm4
98; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
99; SSSE3-NEXT: pand %xmm4, %xmm1
100; SSSE3-NEXT: pandn %xmm5, %xmm4
101; SSSE3-NEXT: por %xmm1, %xmm4
102; SSSE3-NEXT: pand %xmm3, %xmm0
103; SSSE3-NEXT: pandn %xmm5, %xmm3
104; SSSE3-NEXT: por %xmm0, %xmm3
105; SSSE3-NEXT: movdqa %xmm3, %xmm0
106; SSSE3-NEXT: pxor %xmm2, %xmm0
107; SSSE3-NEXT: movdqa %xmm0, %xmm1
108; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
109; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
110; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
111; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
112; SSSE3-NEXT: pand %xmm5, %xmm6
113; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
114; SSSE3-NEXT: por %xmm6, %xmm0
115; SSSE3-NEXT: movdqa %xmm4, %xmm1
116; SSSE3-NEXT: pxor %xmm2, %xmm1
117; SSSE3-NEXT: movdqa %xmm1, %xmm5
118; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
119; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
120; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
121; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
122; SSSE3-NEXT: pand %xmm6, %xmm1
123; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
124; SSSE3-NEXT: por %xmm1, %xmm2
125; SSSE3-NEXT: pand %xmm4, %xmm2
126; SSSE3-NEXT: pand %xmm3, %xmm0
127; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
128; SSSE3-NEXT: retq
129;
130; SSE41-LABEL: trunc_packus_v4i64_v4i32:
131; SSE41: # %bb.0:
132; SSE41-NEXT: movdqa %xmm0, %xmm2
133; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
134; SSE41-NEXT: pxor %xmm4, %xmm0
135; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
136; SSE41-NEXT: movdqa %xmm5, %xmm3
137; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
138; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
139; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
140; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
141; SSE41-NEXT: pand %xmm6, %xmm0
142; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
143; SSE41-NEXT: por %xmm0, %xmm3
144; SSE41-NEXT: movdqa %xmm1, %xmm0
145; SSE41-NEXT: pxor %xmm4, %xmm0
146; SSE41-NEXT: movdqa %xmm5, %xmm6
147; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
148; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
149; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
150; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
151; SSE41-NEXT: pand %xmm7, %xmm5
152; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
153; SSE41-NEXT: por %xmm5, %xmm0
154; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,4294967295]
155; SSE41-NEXT: movapd %xmm5, %xmm6
156; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
157; SSE41-NEXT: movdqa %xmm3, %xmm0
158; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
159; SSE41-NEXT: movapd %xmm5, %xmm0
160; SSE41-NEXT: xorpd %xmm4, %xmm0
161; SSE41-NEXT: movapd %xmm0, %xmm1
162; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
163; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
164; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
165; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
166; SSE41-NEXT: pand %xmm2, %xmm3
167; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
168; SSE41-NEXT: por %xmm3, %xmm0
169; SSE41-NEXT: movapd %xmm6, %xmm1
170; SSE41-NEXT: xorpd %xmm4, %xmm1
171; SSE41-NEXT: movapd %xmm1, %xmm2
172; SSE41-NEXT: pcmpgtd %xmm4, %xmm2
173; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
174; SSE41-NEXT: pcmpeqd %xmm4, %xmm1
175; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
176; SSE41-NEXT: pand %xmm3, %xmm1
177; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
178; SSE41-NEXT: por %xmm1, %xmm2
179; SSE41-NEXT: pand %xmm6, %xmm2
180; SSE41-NEXT: pand %xmm5, %xmm0
181; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
182; SSE41-NEXT: retq
183;
184; AVX1-LABEL: trunc_packus_v4i64_v4i32:
185; AVX1: # %bb.0:
186; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
187; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967295,4294967295]
188; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
189; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
190; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
191; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
192; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
193; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
194; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
195; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
196; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
197; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
198; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
199; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
200; AVX1-NEXT: vzeroupper
201; AVX1-NEXT: retq
202;
203; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32:
204; AVX2-SLOW: # %bb.0:
205; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
206; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
207; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
208; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
209; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
210; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
211; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
212; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
213; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
214; AVX2-SLOW-NEXT: vzeroupper
215; AVX2-SLOW-NEXT: retq
216;
217; AVX2-FAST-LABEL: trunc_packus_v4i64_v4i32:
218; AVX2-FAST: # %bb.0:
219; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
220; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
221; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
222; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
223; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
224; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0
225; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
226; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
227; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
228; AVX2-FAST-NEXT: vzeroupper
229; AVX2-FAST-NEXT: retq
230;
231; AVX512F-LABEL: trunc_packus_v4i64_v4i32:
232; AVX512F: # %bb.0:
233; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
234; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
235; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
236; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
237; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
238; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
239; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
240; AVX512F-NEXT: vzeroupper
241; AVX512F-NEXT: retq
242;
243; AVX512VL-LABEL: trunc_packus_v4i64_v4i32:
244; AVX512VL: # %bb.0:
245; AVX512VL-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
246; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
247; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
248; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
249; AVX512VL-NEXT: vzeroupper
250; AVX512VL-NEXT: retq
251;
252; AVX512BW-LABEL: trunc_packus_v4i64_v4i32:
253; AVX512BW: # %bb.0:
254; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
255; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
256; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
257; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
258; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
259; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
260; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
261; AVX512BW-NEXT: vzeroupper
262; AVX512BW-NEXT: retq
263;
264; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32:
265; AVX512BWVL: # %bb.0:
266; AVX512BWVL-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
267; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
268; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
269; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
270; AVX512BWVL-NEXT: vzeroupper
271; AVX512BWVL-NEXT: retq
272 %1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
273 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
274 %3 = icmp sgt <4 x i64> %2, zeroinitializer
275 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
276 %5 = trunc <4 x i64> %4 to <4 x i32>
277 ret <4 x i32> %5
278}


; Same clamp-then-truncate pattern as above, widened to 8 lanes: smin with
; UINT32_MAX, smax with 0, then trunc i64 -> i32.
define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v8i64_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm10
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm7
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm9, %xmm5
; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm7
; SSE2-NEXT: por %xmm2, %xmm7
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: pandn %xmm4, %xmm6
; SSE2-NEXT: por %xmm1, %xmm6
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm0, %xmm10
; SSE2-NEXT: movdqa %xmm10, %xmm0
; SSE2-NEXT: pxor %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm7, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm9, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: pxor %xmm8, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm9, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm10
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: pxor %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm6
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pxor %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm9, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm7
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: movdqa %xmm9, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm11, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
; SSSE3-NEXT: pand %xmm5, %xmm3
; SSSE3-NEXT: pandn %xmm4, %xmm5
; SSSE3-NEXT: por %xmm3, %xmm5
; SSSE3-NEXT: pand %xmm7, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm7
; SSSE3-NEXT: por %xmm2, %xmm7
; SSSE3-NEXT: pand %xmm6, %xmm1
; SSSE3-NEXT: pandn %xmm4, %xmm6
; SSSE3-NEXT: por %xmm1, %xmm6
; SSSE3-NEXT: pand %xmm10, %xmm0
; SSSE3-NEXT: pandn %xmm4, %xmm10
; SSSE3-NEXT: por %xmm0, %xmm10
; SSSE3-NEXT: movdqa %xmm10, %xmm0
; SSSE3-NEXT: pxor %xmm8, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm6, %xmm1
; SSSE3-NEXT: pxor %xmm8, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: movdqa %xmm7, %xmm1
; SSSE3-NEXT: pxor %xmm8, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm9, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: pxor %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm9, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm3, %xmm4
; SSSE3-NEXT: pand %xmm5, %xmm4
; SSSE3-NEXT: pand %xmm7, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pand %xmm10, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,0,2147483648,0]
; SSE41-NEXT: movdqa %xmm8, %xmm5
; SSE41-NEXT: pxor %xmm11, %xmm5
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: pcmpgtd %xmm5, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: pand %xmm7, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm9
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pxor %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: pcmpgtd %xmm5, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: pand %xmm7, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm12
; SSE41-NEXT: movdqa %xmm2, %xmm5
; SSE41-NEXT: pxor %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm0, %xmm7
; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: pand %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm7
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pxor %xmm11, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT: pand %xmm6, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,4294967295]
; SSE41-NEXT: movapd %xmm5, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10
; SSE41-NEXT: movapd %xmm5, %xmm3
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: movapd %xmm5, %xmm13
; SSE41-NEXT: movdqa %xmm12, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm13
; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: xorpd %xmm11, %xmm0
; SSE41-NEXT: movapd %xmm0, %xmm1
; SSE41-NEXT: pcmpgtd %xmm11, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm4, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm13, %xmm1
; SSE41-NEXT: xorpd %xmm11, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: pcmpgtd %xmm11, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm11, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pand %xmm6, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm1, %xmm6
; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: xorpd %xmm11, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: pcmpgtd %xmm11, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm11, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pand %xmm7, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm10, %xmm2
; SSE41-NEXT: xorpd %xmm11, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: pcmpgtd %xmm11, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm11, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pand %xmm7, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm2, %xmm4
; SSE41-NEXT: pand %xmm10, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
; SSE41-NEXT: pand %xmm13, %xmm6
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i64_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4294967295,4294967295,4294967295,4294967295]
; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm4, %ymm1
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm4, %ymm0
; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm2
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm4
; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm4
; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-FAST-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpand %ymm0, %ymm3, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v8i64_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
  %1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %3 = icmp sgt <8 x i64> %2, zeroinitializer
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  %5 = trunc <8 x i64> %4 to <8 x i32>
  ret <8 x i32> %5
}

;
; PACKUS saturation truncation to vXi16
;

682define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
683; SSE2-LABEL: trunc_packus_v8i64_v8i16:
684; SSE2: # %bb.0:
685; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
686; SSE2-NEXT: movdqa %xmm1, %xmm4
687; SSE2-NEXT: pxor %xmm8, %xmm4
688; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
689; SSE2-NEXT: movdqa %xmm9, %xmm6
690; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
691; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
692; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
693; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
694; SSE2-NEXT: pand %xmm7, %xmm5
695; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
696; SSE2-NEXT: por %xmm5, %xmm11
697; SSE2-NEXT: movdqa %xmm0, %xmm5
698; SSE2-NEXT: pxor %xmm8, %xmm5
699; SSE2-NEXT: movdqa %xmm9, %xmm6
700; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
701; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
702; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
703; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
704; SSE2-NEXT: pand %xmm7, %xmm5
705; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
706; SSE2-NEXT: por %xmm5, %xmm12
707; SSE2-NEXT: movdqa %xmm3, %xmm5
708; SSE2-NEXT: pxor %xmm8, %xmm5
709; SSE2-NEXT: movdqa %xmm9, %xmm7
710; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
711; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
712; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
713; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
714; SSE2-NEXT: pand %xmm10, %xmm5
715; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
716; SSE2-NEXT: por %xmm5, %xmm7
717; SSE2-NEXT: movdqa %xmm2, %xmm5
718; SSE2-NEXT: pxor %xmm8, %xmm5
719; SSE2-NEXT: movdqa %xmm9, %xmm4
720; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
721; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
722; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
723; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
724; SSE2-NEXT: pand %xmm10, %xmm6
725; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
726; SSE2-NEXT: por %xmm6, %xmm5
727; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
728; SSE2-NEXT: pand %xmm5, %xmm2
729; SSE2-NEXT: pandn %xmm4, %xmm5
730; SSE2-NEXT: por %xmm2, %xmm5
731; SSE2-NEXT: pand %xmm7, %xmm3
732; SSE2-NEXT: pandn %xmm4, %xmm7
733; SSE2-NEXT: por %xmm3, %xmm7
734; SSE2-NEXT: pand %xmm12, %xmm0
735; SSE2-NEXT: pandn %xmm4, %xmm12
736; SSE2-NEXT: por %xmm0, %xmm12
737; SSE2-NEXT: pand %xmm11, %xmm1
738; SSE2-NEXT: pandn %xmm4, %xmm11
739; SSE2-NEXT: por %xmm1, %xmm11
740; SSE2-NEXT: movdqa %xmm11, %xmm0
741; SSE2-NEXT: pxor %xmm8, %xmm0
742; SSE2-NEXT: movdqa %xmm0, %xmm1
743; SSE2-NEXT: pcmpgtd %xmm8, %xmm1
744; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
745; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
746; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
747; SSE2-NEXT: pand %xmm2, %xmm3
748; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
749; SSE2-NEXT: por %xmm3, %xmm0
750; SSE2-NEXT: movdqa %xmm12, %xmm1
751; SSE2-NEXT: pxor %xmm8, %xmm1
752; SSE2-NEXT: movdqa %xmm1, %xmm2
753; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
754; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
755; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
756; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
757; SSE2-NEXT: pand %xmm3, %xmm4
758; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
759; SSE2-NEXT: por %xmm4, %xmm1
760; SSE2-NEXT: movdqa %xmm7, %xmm2
761; SSE2-NEXT: pxor %xmm8, %xmm2
762; SSE2-NEXT: movdqa %xmm2, %xmm3
763; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
764; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
765; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
766; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
767; SSE2-NEXT: pand %xmm4, %xmm2
768; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
769; SSE2-NEXT: por %xmm2, %xmm3
770; SSE2-NEXT: movdqa %xmm5, %xmm2
771; SSE2-NEXT: pxor %xmm8, %xmm2
772; SSE2-NEXT: movdqa %xmm2, %xmm4
773; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
774; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
775; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
776; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
777; SSE2-NEXT: pand %xmm6, %xmm2
778; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
779; SSE2-NEXT: por %xmm2, %xmm4
780; SSE2-NEXT: pand %xmm5, %xmm4
781; SSE2-NEXT: pand %xmm7, %xmm3
782; SSE2-NEXT: pand %xmm12, %xmm1
783; SSE2-NEXT: pand %xmm11, %xmm0
784; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
785; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
786; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
787; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
788; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
789; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
790; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
791; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
792; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
793; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
794; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
795; SSE2-NEXT: retq
796;
797; SSSE3-LABEL: trunc_packus_v8i64_v8i16:
798; SSSE3: # %bb.0:
799; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
800; SSSE3-NEXT: movdqa %xmm1, %xmm4
801; SSSE3-NEXT: pxor %xmm8, %xmm4
802; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
803; SSSE3-NEXT: movdqa %xmm9, %xmm6
804; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
805; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
806; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
807; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
808; SSSE3-NEXT: pand %xmm7, %xmm5
809; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
810; SSSE3-NEXT: por %xmm5, %xmm11
811; SSSE3-NEXT: movdqa %xmm0, %xmm5
812; SSSE3-NEXT: pxor %xmm8, %xmm5
813; SSSE3-NEXT: movdqa %xmm9, %xmm6
814; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
815; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
816; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
817; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
818; SSSE3-NEXT: pand %xmm7, %xmm5
819; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
820; SSSE3-NEXT: por %xmm5, %xmm12
821; SSSE3-NEXT: movdqa %xmm3, %xmm5
822; SSSE3-NEXT: pxor %xmm8, %xmm5
823; SSSE3-NEXT: movdqa %xmm9, %xmm7
824; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
825; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
826; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
827; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
828; SSSE3-NEXT: pand %xmm10, %xmm5
829; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
830; SSSE3-NEXT: por %xmm5, %xmm7
831; SSSE3-NEXT: movdqa %xmm2, %xmm5
832; SSSE3-NEXT: pxor %xmm8, %xmm5
833; SSSE3-NEXT: movdqa %xmm9, %xmm4
834; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
835; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
836; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
837; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
838; SSSE3-NEXT: pand %xmm10, %xmm6
839; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
840; SSSE3-NEXT: por %xmm6, %xmm5
841; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
842; SSSE3-NEXT: pand %xmm5, %xmm2
843; SSSE3-NEXT: pandn %xmm4, %xmm5
844; SSSE3-NEXT: por %xmm2, %xmm5
845; SSSE3-NEXT: pand %xmm7, %xmm3
846; SSSE3-NEXT: pandn %xmm4, %xmm7
847; SSSE3-NEXT: por %xmm3, %xmm7
848; SSSE3-NEXT: pand %xmm12, %xmm0
849; SSSE3-NEXT: pandn %xmm4, %xmm12
850; SSSE3-NEXT: por %xmm0, %xmm12
851; SSSE3-NEXT: pand %xmm11, %xmm1
852; SSSE3-NEXT: pandn %xmm4, %xmm11
853; SSSE3-NEXT: por %xmm1, %xmm11
854; SSSE3-NEXT: movdqa %xmm11, %xmm0
855; SSSE3-NEXT: pxor %xmm8, %xmm0
856; SSSE3-NEXT: movdqa %xmm0, %xmm1
857; SSSE3-NEXT: pcmpgtd %xmm8, %xmm1
858; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
859; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0
860; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
861; SSSE3-NEXT: pand %xmm2, %xmm3
862; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
863; SSSE3-NEXT: por %xmm3, %xmm0
864; SSSE3-NEXT: movdqa %xmm12, %xmm1
865; SSSE3-NEXT: pxor %xmm8, %xmm1
866; SSSE3-NEXT: movdqa %xmm1, %xmm2
867; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
868; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
869; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
870; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
871; SSSE3-NEXT: pand %xmm3, %xmm4
872; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
873; SSSE3-NEXT: por %xmm4, %xmm1
874; SSSE3-NEXT: movdqa %xmm7, %xmm2
875; SSSE3-NEXT: pxor %xmm8, %xmm2
876; SSSE3-NEXT: movdqa %xmm2, %xmm3
877; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
878; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
879; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
880; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
881; SSSE3-NEXT: pand %xmm4, %xmm2
882; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
883; SSSE3-NEXT: por %xmm2, %xmm3
884; SSSE3-NEXT: movdqa %xmm5, %xmm2
885; SSSE3-NEXT: pxor %xmm8, %xmm2
886; SSSE3-NEXT: movdqa %xmm2, %xmm4
887; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
888; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
889; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
890; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
891; SSSE3-NEXT: pand %xmm6, %xmm2
892; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
893; SSSE3-NEXT: por %xmm2, %xmm4
894; SSSE3-NEXT: pand %xmm5, %xmm4
895; SSSE3-NEXT: pand %xmm7, %xmm3
896; SSSE3-NEXT: pand %xmm12, %xmm1
897; SSSE3-NEXT: pand %xmm11, %xmm0
898; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
899; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
900; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
901; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
902; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
903; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
904; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
905; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
906; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
907; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
908; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
909; SSSE3-NEXT: retq
910;
911; SSE41-LABEL: trunc_packus_v8i64_v8i16:
912; SSE41: # %bb.0:
913; SSE41-NEXT: movdqa %xmm0, %xmm8
914; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,0,2147483648,0]
915; SSE41-NEXT: movdqa %xmm3, %xmm5
916; SSE41-NEXT: pxor %xmm11, %xmm5
917; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147549183,2147549183]
918; SSE41-NEXT: movdqa %xmm0, %xmm6
919; SSE41-NEXT: pcmpgtd %xmm5, %xmm6
920; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
921; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
922; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
923; SSE41-NEXT: pand %xmm7, %xmm5
924; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
925; SSE41-NEXT: por %xmm5, %xmm9
926; SSE41-NEXT: movdqa %xmm2, %xmm5
927; SSE41-NEXT: pxor %xmm11, %xmm5
928; SSE41-NEXT: movdqa %xmm0, %xmm6
929; SSE41-NEXT: pcmpgtd %xmm5, %xmm6
930; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
931; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
932; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
933; SSE41-NEXT: pand %xmm7, %xmm5
934; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
935; SSE41-NEXT: por %xmm5, %xmm12
936; SSE41-NEXT: movdqa %xmm1, %xmm5
937; SSE41-NEXT: pxor %xmm11, %xmm5
938; SSE41-NEXT: movdqa %xmm0, %xmm7
939; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
940; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
941; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
942; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
943; SSE41-NEXT: pand %xmm4, %xmm5
944; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
945; SSE41-NEXT: por %xmm5, %xmm7
946; SSE41-NEXT: movdqa %xmm8, %xmm4
947; SSE41-NEXT: pxor %xmm11, %xmm4
948; SSE41-NEXT: movdqa %xmm0, %xmm5
949; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
950; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
951; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
952; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
953; SSE41-NEXT: pand %xmm6, %xmm4
954; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
955; SSE41-NEXT: por %xmm4, %xmm0
956; SSE41-NEXT: movapd {{.*#+}} xmm5 = [65535,65535]
957; SSE41-NEXT: movapd %xmm5, %xmm10
958; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm10
959; SSE41-NEXT: movapd %xmm5, %xmm8
960; SSE41-NEXT: movdqa %xmm7, %xmm0
961; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
962; SSE41-NEXT: movapd %xmm5, %xmm13
963; SSE41-NEXT: movdqa %xmm12, %xmm0
964; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm13
965; SSE41-NEXT: movdqa %xmm9, %xmm0
966; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
967; SSE41-NEXT: movapd %xmm5, %xmm0
968; SSE41-NEXT: xorpd %xmm11, %xmm0
969; SSE41-NEXT: movapd %xmm0, %xmm2
970; SSE41-NEXT: pcmpgtd %xmm11, %xmm2
971; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
972; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
973; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
974; SSE41-NEXT: pand %xmm3, %xmm0
975; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
976; SSE41-NEXT: por %xmm0, %xmm2
977; SSE41-NEXT: movapd %xmm13, %xmm0
978; SSE41-NEXT: xorpd %xmm11, %xmm0
979; SSE41-NEXT: movapd %xmm0, %xmm3
980; SSE41-NEXT: pcmpgtd %xmm11, %xmm3
981; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
982; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
983; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
984; SSE41-NEXT: pand %xmm4, %xmm0
985; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
986; SSE41-NEXT: por %xmm0, %xmm3
987; SSE41-NEXT: movapd %xmm8, %xmm0
988; SSE41-NEXT: xorpd %xmm11, %xmm0
989; SSE41-NEXT: movapd %xmm0, %xmm4
990; SSE41-NEXT: pcmpgtd %xmm11, %xmm4
991; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
992; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
993; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
994; SSE41-NEXT: pand %xmm6, %xmm0
995; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
996; SSE41-NEXT: por %xmm0, %xmm6
997; SSE41-NEXT: movapd %xmm10, %xmm0
998; SSE41-NEXT: xorpd %xmm11, %xmm0
999; SSE41-NEXT: movapd %xmm0, %xmm4
1000; SSE41-NEXT: pcmpgtd %xmm11, %xmm4
1001; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
1002; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
1003; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1004; SSE41-NEXT: pand %xmm7, %xmm1
1005; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
1006; SSE41-NEXT: por %xmm1, %xmm0
1007; SSE41-NEXT: pand %xmm10, %xmm0
1008; SSE41-NEXT: pand %xmm8, %xmm6
1009; SSE41-NEXT: pand %xmm13, %xmm3
1010; SSE41-NEXT: pand %xmm5, %xmm2
1011; SSE41-NEXT: pxor %xmm1, %xmm1
1012; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7]
1013; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7]
1014; SSE41-NEXT: packusdw %xmm2, %xmm3
1015; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm1[1,2,3],xmm6[4],xmm1[5,6,7]
1016; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1017; SSE41-NEXT: packusdw %xmm6, %xmm0
1018; SSE41-NEXT: packusdw %xmm3, %xmm0
1019; SSE41-NEXT: retq
1020;
1021; AVX1-LABEL: trunc_packus_v8i64_v8i16:
1022; AVX1: # %bb.0:
1023; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1024; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535]
1025; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
1026; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
1027; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
1028; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1029; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm4
1030; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
1031; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
1032; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [65535,65535,65535,65535]
1033; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm4, %ymm1
1034; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm4, %ymm0
1035; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
1036; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm8
1037; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
1038; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
1039; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6
1040; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
1041; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm3
1042; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1043; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
1044; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1
1045; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
1046; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1047; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
1048; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
1049; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0
1050; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1051; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1052; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1053; AVX1-NEXT: vzeroupper
1054; AVX1-NEXT: retq
1055;
1056; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i16:
1057; AVX2-SLOW: # %bb.0:
1058; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535]
1059; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
1060; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm4
1061; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
1062; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
1063; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
1064; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
1065; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
1066; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
1067; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
1068; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1069; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1070; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1071; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1072; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1073; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1074; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1075; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
1076; AVX2-SLOW-NEXT: vzeroupper
1077; AVX2-SLOW-NEXT: retq
1078;
1079; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i16:
1080; AVX2-FAST: # %bb.0:
1081; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535]
1082; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
1083; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm4
1084; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
1085; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
1086; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
1087; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
1088; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
1089; AVX2-FAST-NEXT: vpand %ymm1, %ymm2, %ymm1
1090; AVX2-FAST-NEXT: vpand %ymm0, %ymm3, %ymm0
1091; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1092; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1093; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1094; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1095; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1096; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1097; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
1098; AVX2-FAST-NEXT: vzeroupper
1099; AVX2-FAST-NEXT: retq
1100;
1101; AVX512-LABEL: trunc_packus_v8i64_v8i16:
1102; AVX512: # %bb.0:
1103; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
1104; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1105; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
1106; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1107; AVX512-NEXT: vzeroupper
1108; AVX512-NEXT: retq
1109 %1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
1110 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
1111 %3 = icmp sgt <8 x i64> %2, zeroinitializer
1112 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1113 %5 = trunc <8 x i64> %4 to <8 x i16>
1114 ret <8 x i16> %5
1115}
1116
; Truncate <8 x i32> to <8 x i16> with unsigned saturation ("packus"): each
; lane is clamped into [0, 65535] by two compare+select steps before the
; truncate.  The CHECK lines below record the expected per-subtarget lowering
; and are autogenerated by utils/update_llc_test_checks.py (see file header);
; regenerate them rather than editing by hand.
define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
; SSE2-LABEL: trunc_packus_v8i32_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pandn %xmm2, %xmm4
; SSSE3-NEXT: por %xmm0, %xmm4
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pandn %xmm2, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i32_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE41-NEXT: pminsd %xmm2, %xmm1
; SSE41-NEXT: pminsd %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmaxsd %xmm2, %xmm0
; SSE41-NEXT: pmaxsd %xmm2, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v8i32_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v8i32_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v8i32_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  ; %1/%2: clamp the upper bound - keep %a0 where %a0 < 65535, else 65535.
  %1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  ; %3/%4: clamp the lower bound - keep %2 where %2 > 0, else 0.
  %3 = icmp sgt <8 x i32> %2, zeroinitializer
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  ; Every lane is now in [0, 65535], so the truncate loses no value bits.
  %5 = trunc <8 x i32> %4 to <8 x i16>
  ret <8 x i16> %5
}
1259
; Truncate <16 x i32> to <16 x i16> with unsigned saturation ("packus"):
; each lane is clamped into [0, 65535] by two compare+select steps before the
; truncate.  The CHECK lines below record the expected per-subtarget lowering
; and are autogenerated by utils/update_llc_test_checks.py (see file header);
; regenerate them rather than editing by hand.
define <16 x i16> @trunc_packus_v16i32_v16i16(<16 x i32> %a0) {
; SSE2-LABEL: trunc_packus_v16i32_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm8, %xmm9
; SSE2-NEXT: pcmpgtd %xmm1, %xmm9
; SSE2-NEXT: movdqa %xmm8, %xmm5
; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm8, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm8, %xmm7
; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm7
; SSE2-NEXT: por %xmm2, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm5
; SSE2-NEXT: por %xmm0, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm9
; SSE2-NEXT: por %xmm1, %xmm9
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm9, %xmm3
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
; SSE2-NEXT: movdqa %xmm7, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pand %xmm9, %xmm3
; SSE2-NEXT: pslld $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm3, %xmm0
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: packssdw %xmm6, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v16i32_v16i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa %xmm8, %xmm9
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm9
; SSSE3-NEXT: movdqa %xmm8, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
; SSSE3-NEXT: movdqa %xmm8, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
; SSSE3-NEXT: movdqa %xmm8, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7
; SSSE3-NEXT: pand %xmm7, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm7
; SSSE3-NEXT: por %xmm2, %xmm7
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm3, %xmm4
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm5
; SSSE3-NEXT: por %xmm0, %xmm5
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: pandn %xmm8, %xmm9
; SSSE3-NEXT: por %xmm1, %xmm9
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm9, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm5, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm4, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
; SSSE3-NEXT: movdqa %xmm7, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT: pand %xmm7, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm6
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pand %xmm9, %xmm3
; SSSE3-NEXT: pslld $16, %xmm3
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: pslld $16, %xmm0
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm3, %xmm0
; SSSE3-NEXT: pslld $16, %xmm6
; SSSE3-NEXT: psrad $16, %xmm6
; SSSE3-NEXT: pslld $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: packssdw %xmm6, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v16i32_v16i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
; SSE41-NEXT: pminsd %xmm4, %xmm1
; SSE41-NEXT: pminsd %xmm4, %xmm0
; SSE41-NEXT: pminsd %xmm4, %xmm3
; SSE41-NEXT: pminsd %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmaxsd %xmm2, %xmm4
; SSE41-NEXT: pmaxsd %xmm2, %xmm3
; SSE41-NEXT: pmaxsd %xmm2, %xmm0
; SSE41-NEXT: pmaxsd %xmm2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
; SSE41-NEXT: packusdw %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i32_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminsd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpminsd %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpminsd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmaxsd %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i32_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v16i32_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: retq
  ; %1/%2: clamp the upper bound - keep %a0 where %a0 < 65535, else 65535.
  %1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  ; %3/%4: clamp the lower bound - keep %2 where %2 > 0, else 0.
  %3 = icmp sgt <16 x i32> %2, zeroinitializer
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  ; Every lane is now in [0, 65535], so the truncate loses no value bits.
  %5 = trunc <16 x i32> %4 to <16 x i16>
  ret <16 x i16> %5
}
1431
1432;
; PACKUS saturation truncation to vXi8
1434;
1435
; Checks lowering of a <8 x i64> -> <8 x i8> truncation with unsigned (PACKUS-style)
; saturation for each RUN configuration in the file header (SSE2/SSSE3/SSE4.1/AVX/
; AVX2-SLOW/AVX2-FAST/AVX512 variants).
; NOTE(review): every "; <PREFIX>-LABEL"/"-NEXT" line in this body was produced by
; utils/update_llc_test_checks.py (see file header) -- regenerate with that script
; rather than editing the expected assembly by hand.
1436define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
1437; SSE2-LABEL: trunc_packus_v8i64_v8i8:
1438; SSE2: # %bb.0:
1439; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
1440; SSE2-NEXT: movdqa %xmm1, %xmm4
1441; SSE2-NEXT: pxor %xmm8, %xmm4
1442; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
1443; SSE2-NEXT: movdqa %xmm9, %xmm6
1444; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
1445; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1446; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
1447; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
1448; SSE2-NEXT: pand %xmm7, %xmm5
1449; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
1450; SSE2-NEXT: por %xmm5, %xmm11
1451; SSE2-NEXT: movdqa %xmm0, %xmm5
1452; SSE2-NEXT: pxor %xmm8, %xmm5
1453; SSE2-NEXT: movdqa %xmm9, %xmm6
1454; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
1455; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1456; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
1457; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1458; SSE2-NEXT: pand %xmm7, %xmm5
1459; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
1460; SSE2-NEXT: por %xmm5, %xmm12
1461; SSE2-NEXT: movdqa %xmm3, %xmm5
1462; SSE2-NEXT: pxor %xmm8, %xmm5
1463; SSE2-NEXT: movdqa %xmm9, %xmm7
1464; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
1465; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
1466; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
1467; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1468; SSE2-NEXT: pand %xmm10, %xmm5
1469; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1470; SSE2-NEXT: por %xmm5, %xmm7
1471; SSE2-NEXT: movdqa %xmm2, %xmm5
1472; SSE2-NEXT: pxor %xmm8, %xmm5
1473; SSE2-NEXT: movdqa %xmm9, %xmm4
1474; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
1475; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
1476; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
1477; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
1478; SSE2-NEXT: pand %xmm10, %xmm6
1479; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
1480; SSE2-NEXT: por %xmm6, %xmm5
1481; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
1482; SSE2-NEXT: pand %xmm5, %xmm2
1483; SSE2-NEXT: pandn %xmm4, %xmm5
1484; SSE2-NEXT: por %xmm2, %xmm5
1485; SSE2-NEXT: pand %xmm7, %xmm3
1486; SSE2-NEXT: pandn %xmm4, %xmm7
1487; SSE2-NEXT: por %xmm3, %xmm7
1488; SSE2-NEXT: pand %xmm12, %xmm0
1489; SSE2-NEXT: pandn %xmm4, %xmm12
1490; SSE2-NEXT: por %xmm0, %xmm12
1491; SSE2-NEXT: pand %xmm11, %xmm1
1492; SSE2-NEXT: pandn %xmm4, %xmm11
1493; SSE2-NEXT: por %xmm1, %xmm11
1494; SSE2-NEXT: movdqa %xmm11, %xmm0
1495; SSE2-NEXT: pxor %xmm8, %xmm0
1496; SSE2-NEXT: movdqa %xmm0, %xmm1
1497; SSE2-NEXT: pcmpgtd %xmm8, %xmm1
1498; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
1499; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
1500; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1501; SSE2-NEXT: pand %xmm2, %xmm3
1502; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
1503; SSE2-NEXT: por %xmm3, %xmm0
1504; SSE2-NEXT: movdqa %xmm12, %xmm1
1505; SSE2-NEXT: pxor %xmm8, %xmm1
1506; SSE2-NEXT: movdqa %xmm1, %xmm2
1507; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
1508; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
1509; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
1510; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1511; SSE2-NEXT: pand %xmm3, %xmm4
1512; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
1513; SSE2-NEXT: por %xmm4, %xmm1
1514; SSE2-NEXT: movdqa %xmm7, %xmm2
1515; SSE2-NEXT: pxor %xmm8, %xmm2
1516; SSE2-NEXT: movdqa %xmm2, %xmm3
1517; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
1518; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
1519; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
1520; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1521; SSE2-NEXT: pand %xmm4, %xmm2
1522; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1523; SSE2-NEXT: por %xmm2, %xmm3
1524; SSE2-NEXT: movdqa %xmm5, %xmm2
1525; SSE2-NEXT: pxor %xmm8, %xmm2
1526; SSE2-NEXT: movdqa %xmm2, %xmm4
1527; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
1528; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1529; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
1530; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1531; SSE2-NEXT: pand %xmm6, %xmm2
1532; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1533; SSE2-NEXT: por %xmm2, %xmm4
1534; SSE2-NEXT: pand %xmm5, %xmm4
1535; SSE2-NEXT: pand %xmm7, %xmm3
1536; SSE2-NEXT: pand %xmm12, %xmm1
1537; SSE2-NEXT: pand %xmm11, %xmm0
1538; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1539; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1540; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1541; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1542; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1543; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1544; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
1545; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
1546; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1547; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1548; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1549; SSE2-NEXT: retq
1550;
1551; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
1552; SSSE3: # %bb.0:
1553; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
1554; SSSE3-NEXT: movdqa %xmm1, %xmm4
1555; SSSE3-NEXT: pxor %xmm8, %xmm4
1556; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
1557; SSSE3-NEXT: movdqa %xmm9, %xmm6
1558; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1559; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1560; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
1561; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
1562; SSSE3-NEXT: pand %xmm7, %xmm5
1563; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
1564; SSSE3-NEXT: por %xmm5, %xmm11
1565; SSSE3-NEXT: movdqa %xmm0, %xmm5
1566; SSSE3-NEXT: pxor %xmm8, %xmm5
1567; SSSE3-NEXT: movdqa %xmm9, %xmm6
1568; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
1569; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1570; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
1571; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1572; SSSE3-NEXT: pand %xmm7, %xmm5
1573; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
1574; SSSE3-NEXT: por %xmm5, %xmm12
1575; SSSE3-NEXT: movdqa %xmm3, %xmm5
1576; SSSE3-NEXT: pxor %xmm8, %xmm5
1577; SSSE3-NEXT: movdqa %xmm9, %xmm7
1578; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
1579; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
1580; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
1581; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1582; SSSE3-NEXT: pand %xmm10, %xmm5
1583; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1584; SSSE3-NEXT: por %xmm5, %xmm7
1585; SSSE3-NEXT: movdqa %xmm2, %xmm5
1586; SSSE3-NEXT: pxor %xmm8, %xmm5
1587; SSSE3-NEXT: movdqa %xmm9, %xmm4
1588; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
1589; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
1590; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
1591; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
1592; SSSE3-NEXT: pand %xmm10, %xmm6
1593; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
1594; SSSE3-NEXT: por %xmm6, %xmm5
1595; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
1596; SSSE3-NEXT: pand %xmm5, %xmm2
1597; SSSE3-NEXT: pandn %xmm4, %xmm5
1598; SSSE3-NEXT: por %xmm2, %xmm5
1599; SSSE3-NEXT: pand %xmm7, %xmm3
1600; SSSE3-NEXT: pandn %xmm4, %xmm7
1601; SSSE3-NEXT: por %xmm3, %xmm7
1602; SSSE3-NEXT: pand %xmm12, %xmm0
1603; SSSE3-NEXT: pandn %xmm4, %xmm12
1604; SSSE3-NEXT: por %xmm0, %xmm12
1605; SSSE3-NEXT: pand %xmm11, %xmm1
1606; SSSE3-NEXT: pandn %xmm4, %xmm11
1607; SSSE3-NEXT: por %xmm1, %xmm11
1608; SSSE3-NEXT: movdqa %xmm11, %xmm0
1609; SSSE3-NEXT: pxor %xmm8, %xmm0
1610; SSSE3-NEXT: movdqa %xmm0, %xmm1
1611; SSSE3-NEXT: pcmpgtd %xmm8, %xmm1
1612; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
1613; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0
1614; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1615; SSSE3-NEXT: pand %xmm2, %xmm3
1616; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
1617; SSSE3-NEXT: por %xmm3, %xmm0
1618; SSSE3-NEXT: movdqa %xmm12, %xmm1
1619; SSSE3-NEXT: pxor %xmm8, %xmm1
1620; SSSE3-NEXT: movdqa %xmm1, %xmm2
1621; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
1622; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
1623; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
1624; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1625; SSSE3-NEXT: pand %xmm3, %xmm4
1626; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
1627; SSSE3-NEXT: por %xmm4, %xmm1
1628; SSSE3-NEXT: movdqa %xmm7, %xmm2
1629; SSSE3-NEXT: pxor %xmm8, %xmm2
1630; SSSE3-NEXT: movdqa %xmm2, %xmm3
1631; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
1632; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
1633; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
1634; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1635; SSSE3-NEXT: pand %xmm4, %xmm2
1636; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1637; SSSE3-NEXT: por %xmm2, %xmm3
1638; SSSE3-NEXT: movdqa %xmm5, %xmm2
1639; SSSE3-NEXT: pxor %xmm8, %xmm2
1640; SSSE3-NEXT: movdqa %xmm2, %xmm4
1641; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
1642; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1643; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
1644; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1645; SSSE3-NEXT: pand %xmm6, %xmm2
1646; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1647; SSSE3-NEXT: por %xmm2, %xmm4
1648; SSSE3-NEXT: pand %xmm5, %xmm4
1649; SSSE3-NEXT: pand %xmm7, %xmm3
1650; SSSE3-NEXT: pand %xmm12, %xmm1
1651; SSSE3-NEXT: pand %xmm11, %xmm0
1652; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1653; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1654; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1655; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1656; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1657; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1658; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
1659; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
1660; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1661; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1662; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1663; SSSE3-NEXT: retq
1664;
1665; SSE41-LABEL: trunc_packus_v8i64_v8i8:
1666; SSE41: # %bb.0:
1667; SSE41-NEXT: movdqa %xmm0, %xmm8
1668; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,0,2147483648,0]
1669; SSE41-NEXT: movdqa %xmm3, %xmm5
1670; SSE41-NEXT: pxor %xmm11, %xmm5
1671; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903]
1672; SSE41-NEXT: movdqa %xmm0, %xmm6
1673; SSE41-NEXT: pcmpgtd %xmm5, %xmm6
1674; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1675; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
1676; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1677; SSE41-NEXT: pand %xmm7, %xmm5
1678; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
1679; SSE41-NEXT: por %xmm5, %xmm9
1680; SSE41-NEXT: movdqa %xmm2, %xmm5
1681; SSE41-NEXT: pxor %xmm11, %xmm5
1682; SSE41-NEXT: movdqa %xmm0, %xmm6
1683; SSE41-NEXT: pcmpgtd %xmm5, %xmm6
1684; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1685; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
1686; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1687; SSE41-NEXT: pand %xmm7, %xmm5
1688; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
1689; SSE41-NEXT: por %xmm5, %xmm12
1690; SSE41-NEXT: movdqa %xmm1, %xmm5
1691; SSE41-NEXT: pxor %xmm11, %xmm5
1692; SSE41-NEXT: movdqa %xmm0, %xmm7
1693; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
1694; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
1695; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
1696; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1697; SSE41-NEXT: pand %xmm4, %xmm5
1698; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1699; SSE41-NEXT: por %xmm5, %xmm7
1700; SSE41-NEXT: movdqa %xmm8, %xmm4
1701; SSE41-NEXT: pxor %xmm11, %xmm4
1702; SSE41-NEXT: movdqa %xmm0, %xmm5
1703; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
1704; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
1705; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
1706; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1707; SSE41-NEXT: pand %xmm6, %xmm4
1708; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
1709; SSE41-NEXT: por %xmm4, %xmm0
1710; SSE41-NEXT: movapd {{.*#+}} xmm5 = [255,255]
1711; SSE41-NEXT: movapd %xmm5, %xmm10
1712; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm10
1713; SSE41-NEXT: movapd %xmm5, %xmm8
1714; SSE41-NEXT: movdqa %xmm7, %xmm0
1715; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
1716; SSE41-NEXT: movapd %xmm5, %xmm13
1717; SSE41-NEXT: movdqa %xmm12, %xmm0
1718; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm13
1719; SSE41-NEXT: movdqa %xmm9, %xmm0
1720; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
1721; SSE41-NEXT: movapd %xmm5, %xmm0
1722; SSE41-NEXT: xorpd %xmm11, %xmm0
1723; SSE41-NEXT: movapd %xmm0, %xmm2
1724; SSE41-NEXT: pcmpgtd %xmm11, %xmm2
1725; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
1726; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
1727; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1728; SSE41-NEXT: pand %xmm3, %xmm0
1729; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1730; SSE41-NEXT: por %xmm0, %xmm2
1731; SSE41-NEXT: movapd %xmm13, %xmm0
1732; SSE41-NEXT: xorpd %xmm11, %xmm0
1733; SSE41-NEXT: movapd %xmm0, %xmm3
1734; SSE41-NEXT: pcmpgtd %xmm11, %xmm3
1735; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
1736; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
1737; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1738; SSE41-NEXT: pand %xmm4, %xmm0
1739; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1740; SSE41-NEXT: por %xmm0, %xmm3
1741; SSE41-NEXT: movapd %xmm8, %xmm0
1742; SSE41-NEXT: xorpd %xmm11, %xmm0
1743; SSE41-NEXT: movapd %xmm0, %xmm4
1744; SSE41-NEXT: pcmpgtd %xmm11, %xmm4
1745; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1746; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
1747; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1748; SSE41-NEXT: pand %xmm6, %xmm0
1749; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
1750; SSE41-NEXT: por %xmm0, %xmm6
1751; SSE41-NEXT: movapd %xmm10, %xmm0
1752; SSE41-NEXT: xorpd %xmm11, %xmm0
1753; SSE41-NEXT: movapd %xmm0, %xmm4
1754; SSE41-NEXT: pcmpgtd %xmm11, %xmm4
1755; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
1756; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
1757; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1758; SSE41-NEXT: pand %xmm7, %xmm1
1759; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
1760; SSE41-NEXT: por %xmm1, %xmm0
1761; SSE41-NEXT: pand %xmm10, %xmm0
1762; SSE41-NEXT: pand %xmm8, %xmm6
1763; SSE41-NEXT: pand %xmm13, %xmm3
1764; SSE41-NEXT: pand %xmm5, %xmm2
1765; SSE41-NEXT: pxor %xmm1, %xmm1
1766; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7]
1767; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7]
1768; SSE41-NEXT: packusdw %xmm2, %xmm3
1769; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm1[1,2,3],xmm6[4],xmm1[5,6,7]
1770; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1771; SSE41-NEXT: packusdw %xmm6, %xmm0
1772; SSE41-NEXT: packusdw %xmm3, %xmm0
1773; SSE41-NEXT: retq
1774;
1775; AVX1-LABEL: trunc_packus_v8i64_v8i8:
1776; AVX1: # %bb.0:
1777; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1778; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255]
1779; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
1780; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
1781; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
1782; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1783; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm4
1784; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
1785; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
1786; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [255,255,255,255]
1787; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm4, %ymm1
1788; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm4, %ymm0
1789; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
1790; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm8
1791; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
1792; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
1793; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6
1794; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
1795; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm3
1796; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1797; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
1798; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1
1799; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
1800; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1801; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
1802; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
1803; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0
1804; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1805; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1806; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1807; AVX1-NEXT: vzeroupper
1808; AVX1-NEXT: retq
1809;
1810; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i8:
1811; AVX2-SLOW: # %bb.0:
1812; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
1813; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
1814; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm4
1815; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
1816; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
1817; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
1818; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
1819; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
1820; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
1821; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
1822; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1823; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1824; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1825; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1826; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1827; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1828; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1829; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
1830; AVX2-SLOW-NEXT: vzeroupper
1831; AVX2-SLOW-NEXT: retq
1832;
1833; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i8:
1834; AVX2-FAST: # %bb.0:
1835; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
1836; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
1837; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm4
1838; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
1839; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
1840; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
1841; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
1842; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
1843; AVX2-FAST-NEXT: vpand %ymm1, %ymm2, %ymm1
1844; AVX2-FAST-NEXT: vpand %ymm0, %ymm3, %ymm0
1845; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1846; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1847; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1848; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1849; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1850; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1851; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
1852; AVX2-FAST-NEXT: vzeroupper
1853; AVX2-FAST-NEXT: retq
1854;
1855; AVX512-LABEL: trunc_packus_v8i64_v8i8:
1856; AVX512: # %bb.0:
1857; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
1858; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1859; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
1860; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1861; AVX512-NEXT: vzeroupper
1862; AVX512-NEXT: retq
; Reference IR: clamp each i64 lane to [0, 255] -- signed min with 255 via
; icmp slt + select, then signed max with 0 via icmp sgt + select -- and
; truncate the clamped value to i8.
1863 %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
1864 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
1865 %3 = icmp sgt <8 x i64> %2, zeroinitializer
1866 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1867 %5 = trunc <8 x i64> %4 to <8 x i8>
1868 ret <8 x i8> %5
1869}
1870
1871define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
1872; SSE2-LABEL: trunc_packus_v16i64_v16i8:
1873; SSE2: # %bb.0:
1874; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
1875; SSE2-NEXT: movdqa %xmm6, %xmm10
1876; SSE2-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
1877; SSE2-NEXT: movdqa %xmm5, %xmm11
1878; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
1879; SSE2-NEXT: movdqa %xmm4, %xmm6
1880; SSE2-NEXT: movdqa %xmm3, %xmm5
1881; SSE2-NEXT: movdqa %xmm1, %xmm3
1882; SSE2-NEXT: movdqa %xmm0, %xmm8
1883; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,0,2147483648,0]
1884; SSE2-NEXT: movdqa %xmm7, %xmm1
1885; SSE2-NEXT: pxor %xmm12, %xmm1
1886; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903]
1887; SSE2-NEXT: movdqa %xmm0, %xmm4
1888; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
1889; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
1890; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
1891; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1892; SSE2-NEXT: pand %xmm9, %xmm1
1893; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
1894; SSE2-NEXT: por %xmm1, %xmm9
1895; SSE2-NEXT: movdqa %xmm10, %xmm1
1896; SSE2-NEXT: pxor %xmm12, %xmm1
1897; SSE2-NEXT: movdqa %xmm0, %xmm4
1898; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
1899; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
1900; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
1901; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1902; SSE2-NEXT: pand %xmm10, %xmm1
1903; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3]
1904; SSE2-NEXT: por %xmm1, %xmm10
1905; SSE2-NEXT: movdqa %xmm11, %xmm1
1906; SSE2-NEXT: pxor %xmm12, %xmm1
1907; SSE2-NEXT: movdqa %xmm0, %xmm4
1908; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
1909; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
1910; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
1911; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1912; SSE2-NEXT: pand %xmm11, %xmm1
1913; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3]
1914; SSE2-NEXT: por %xmm1, %xmm13
1915; SSE2-NEXT: movdqa %xmm6, %xmm1
1916; SSE2-NEXT: pxor %xmm12, %xmm1
1917; SSE2-NEXT: movdqa %xmm0, %xmm4
1918; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
1919; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
1920; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
1921; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1922; SSE2-NEXT: pand %xmm11, %xmm1
1923; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,3,3]
1924; SSE2-NEXT: por %xmm1, %xmm11
1925; SSE2-NEXT: movdqa %xmm5, %xmm1
1926; SSE2-NEXT: pxor %xmm12, %xmm1
1927; SSE2-NEXT: movdqa %xmm0, %xmm4
1928; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
1929; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
1930; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1931; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
1932; SSE2-NEXT: pand %xmm7, %xmm1
1933; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3]
1934; SSE2-NEXT: por %xmm1, %xmm14
1935; SSE2-NEXT: movdqa %xmm2, %xmm1
1936; SSE2-NEXT: pxor %xmm12, %xmm1
1937; SSE2-NEXT: movdqa %xmm0, %xmm4
1938; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
1939; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
1940; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1941; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
1942; SSE2-NEXT: pand %xmm7, %xmm1
1943; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
1944; SSE2-NEXT: por %xmm1, %xmm15
1945; SSE2-NEXT: movdqa %xmm3, %xmm1
1946; SSE2-NEXT: pxor %xmm12, %xmm1
1947; SSE2-NEXT: movdqa %xmm0, %xmm4
1948; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
1949; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
1950; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
1951; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
1952; SSE2-NEXT: pand %xmm1, %xmm7
1953; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
1954; SSE2-NEXT: por %xmm7, %xmm1
1955; SSE2-NEXT: movdqa %xmm8, %xmm4
1956; SSE2-NEXT: pxor %xmm12, %xmm4
1957; SSE2-NEXT: movdqa %xmm0, %xmm7
1958; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
1959; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
1960; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1961; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
1962; SSE2-NEXT: pand %xmm0, %xmm4
1963; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
1964; SSE2-NEXT: por %xmm4, %xmm0
1965; SSE2-NEXT: pand %xmm0, %xmm8
1966; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
1967; SSE2-NEXT: pandn %xmm4, %xmm0
1968; SSE2-NEXT: por %xmm8, %xmm0
1969; SSE2-NEXT: pand %xmm1, %xmm3
1970; SSE2-NEXT: pandn %xmm4, %xmm1
1971; SSE2-NEXT: por %xmm3, %xmm1
1972; SSE2-NEXT: pand %xmm15, %xmm2
1973; SSE2-NEXT: pandn %xmm4, %xmm15
1974; SSE2-NEXT: por %xmm2, %xmm15
1975; SSE2-NEXT: pand %xmm14, %xmm5
1976; SSE2-NEXT: pandn %xmm4, %xmm14
1977; SSE2-NEXT: por %xmm5, %xmm14
1978; SSE2-NEXT: pand %xmm11, %xmm6
1979; SSE2-NEXT: pandn %xmm4, %xmm11
1980; SSE2-NEXT: por %xmm6, %xmm11
1981; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
1982; SSE2-NEXT: pand %xmm13, %xmm2
1983; SSE2-NEXT: pandn %xmm4, %xmm13
1984; SSE2-NEXT: por %xmm2, %xmm13
1985; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
1986; SSE2-NEXT: pand %xmm10, %xmm2
1987; SSE2-NEXT: pandn %xmm4, %xmm10
1988; SSE2-NEXT: por %xmm2, %xmm10
1989; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
1990; SSE2-NEXT: pand %xmm9, %xmm2
1991; SSE2-NEXT: pandn %xmm4, %xmm9
1992; SSE2-NEXT: por %xmm2, %xmm9
1993; SSE2-NEXT: movdqa %xmm9, %xmm2
1994; SSE2-NEXT: pxor %xmm12, %xmm2
1995; SSE2-NEXT: movdqa %xmm2, %xmm3
1996; SSE2-NEXT: pcmpgtd %xmm12, %xmm3
1997; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
1998; SSE2-NEXT: pcmpeqd %xmm12, %xmm2
1999; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2000; SSE2-NEXT: pand %xmm5, %xmm2
2001; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2002; SSE2-NEXT: por %xmm2, %xmm3
2003; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
2004; SSE2-NEXT: movdqa %xmm10, %xmm2
2005; SSE2-NEXT: pxor %xmm12, %xmm2
2006; SSE2-NEXT: movdqa %xmm2, %xmm3
2007; SSE2-NEXT: pcmpgtd %xmm12, %xmm3
2008; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2009; SSE2-NEXT: pcmpeqd %xmm12, %xmm2
2010; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2011; SSE2-NEXT: pand %xmm5, %xmm2
2012; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2013; SSE2-NEXT: por %xmm2, %xmm3
2014; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
2015; SSE2-NEXT: movdqa %xmm13, %xmm2
2016; SSE2-NEXT: pxor %xmm12, %xmm2
2017; SSE2-NEXT: movdqa %xmm2, %xmm3
2018; SSE2-NEXT: pcmpgtd %xmm12, %xmm3
2019; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2020; SSE2-NEXT: pcmpeqd %xmm12, %xmm2
2021; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2022; SSE2-NEXT: pand %xmm5, %xmm2
2023; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2024; SSE2-NEXT: por %xmm2, %xmm3
2025; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
2026; SSE2-NEXT: movdqa %xmm11, %xmm2
2027; SSE2-NEXT: pxor %xmm12, %xmm2
2028; SSE2-NEXT: movdqa %xmm2, %xmm3
2029; SSE2-NEXT: pcmpgtd %xmm12, %xmm3
2030; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
2031; SSE2-NEXT: pcmpeqd %xmm12, %xmm2
2032; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2033; SSE2-NEXT: pand %xmm6, %xmm2
2034; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
2035; SSE2-NEXT: por %xmm2, %xmm6
2036; SSE2-NEXT: movdqa %xmm14, %xmm2
2037; SSE2-NEXT: pxor %xmm12, %xmm2
2038; SSE2-NEXT: movdqa %xmm2, %xmm3
2039; SSE2-NEXT: pcmpgtd %xmm12, %xmm3
2040; SSE2-NEXT: pcmpeqd %xmm12, %xmm2
2041; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2042; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
2043; SSE2-NEXT: pand %xmm7, %xmm2
2044; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
2045; SSE2-NEXT: por %xmm2, %xmm7
2046; SSE2-NEXT: movdqa %xmm15, %xmm2
2047; SSE2-NEXT: pxor %xmm12, %xmm2
2048; SSE2-NEXT: movdqa %xmm2, %xmm3
2049; SSE2-NEXT: pcmpgtd %xmm12, %xmm3
2050; SSE2-NEXT: pcmpeqd %xmm12, %xmm2
2051; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2052; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
2053; SSE2-NEXT: pand %xmm8, %xmm2
2054; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3]
2055; SSE2-NEXT: por %xmm2, %xmm8
2056; SSE2-NEXT: movdqa %xmm1, %xmm2
2057; SSE2-NEXT: pxor %xmm12, %xmm2
2058; SSE2-NEXT: movdqa %xmm2, %xmm3
2059; SSE2-NEXT: pcmpgtd %xmm12, %xmm3
2060; SSE2-NEXT: pcmpeqd %xmm12, %xmm2
2061; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2062; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2063; SSE2-NEXT: pand %xmm5, %xmm2
2064; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2065; SSE2-NEXT: por %xmm2, %xmm3
2066; SSE2-NEXT: movdqa %xmm0, %xmm2
2067; SSE2-NEXT: pxor %xmm12, %xmm2
2068; SSE2-NEXT: movdqa %xmm2, %xmm5
2069; SSE2-NEXT: pcmpgtd %xmm12, %xmm5
2070; SSE2-NEXT: pcmpeqd %xmm12, %xmm2
2071; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2072; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
2073; SSE2-NEXT: pand %xmm4, %xmm2
2074; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
2075; SSE2-NEXT: por %xmm2, %xmm12
2076; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
2077; SSE2-NEXT: pand %xmm2, %xmm9
2078; SSE2-NEXT: pand -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
2079; SSE2-NEXT: pand %xmm2, %xmm10
2080; SSE2-NEXT: pand -{{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
2081; SSE2-NEXT: packuswb %xmm9, %xmm10
2082; SSE2-NEXT: pand %xmm2, %xmm13
2083; SSE2-NEXT: pand -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
2084; SSE2-NEXT: pand %xmm2, %xmm11
2085; SSE2-NEXT: pand %xmm6, %xmm11
2086; SSE2-NEXT: packuswb %xmm13, %xmm11
2087; SSE2-NEXT: packuswb %xmm10, %xmm11
2088; SSE2-NEXT: pand %xmm2, %xmm14
2089; SSE2-NEXT: pand %xmm7, %xmm14
2090; SSE2-NEXT: pand %xmm2, %xmm15
2091; SSE2-NEXT: pand %xmm8, %xmm15
2092; SSE2-NEXT: packuswb %xmm14, %xmm15
2093; SSE2-NEXT: pand %xmm2, %xmm1
2094; SSE2-NEXT: pand %xmm3, %xmm1
2095; SSE2-NEXT: pand %xmm2, %xmm0
2096; SSE2-NEXT: pand %xmm12, %xmm0
2097; SSE2-NEXT: packuswb %xmm1, %xmm0
2098; SSE2-NEXT: packuswb %xmm15, %xmm0
2099; SSE2-NEXT: packuswb %xmm11, %xmm0
2100; SSE2-NEXT: retq
2101;
2102; SSSE3-LABEL: trunc_packus_v16i64_v16i8:
2103; SSSE3: # %bb.0:
2104; SSSE3-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
2105; SSSE3-NEXT: movdqa %xmm6, %xmm10
2106; SSSE3-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
2107; SSSE3-NEXT: movdqa %xmm5, %xmm11
2108; SSSE3-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
2109; SSSE3-NEXT: movdqa %xmm4, %xmm6
2110; SSSE3-NEXT: movdqa %xmm3, %xmm5
2111; SSSE3-NEXT: movdqa %xmm1, %xmm3
2112; SSSE3-NEXT: movdqa %xmm0, %xmm8
2113; SSSE3-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,0,2147483648,0]
2114; SSSE3-NEXT: movdqa %xmm7, %xmm1
2115; SSSE3-NEXT: pxor %xmm12, %xmm1
2116; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903]
2117; SSSE3-NEXT: movdqa %xmm0, %xmm4
2118; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
2119; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
2120; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
2121; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2122; SSSE3-NEXT: pand %xmm9, %xmm1
2123; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
2124; SSSE3-NEXT: por %xmm1, %xmm9
2125; SSSE3-NEXT: movdqa %xmm10, %xmm1
2126; SSSE3-NEXT: pxor %xmm12, %xmm1
2127; SSSE3-NEXT: movdqa %xmm0, %xmm4
2128; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
2129; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
2130; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
2131; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2132; SSSE3-NEXT: pand %xmm10, %xmm1
2133; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3]
2134; SSSE3-NEXT: por %xmm1, %xmm10
2135; SSSE3-NEXT: movdqa %xmm11, %xmm1
2136; SSSE3-NEXT: pxor %xmm12, %xmm1
2137; SSSE3-NEXT: movdqa %xmm0, %xmm4
2138; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
2139; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
2140; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
2141; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2142; SSSE3-NEXT: pand %xmm11, %xmm1
2143; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3]
2144; SSSE3-NEXT: por %xmm1, %xmm13
2145; SSSE3-NEXT: movdqa %xmm6, %xmm1
2146; SSSE3-NEXT: pxor %xmm12, %xmm1
2147; SSSE3-NEXT: movdqa %xmm0, %xmm4
2148; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
2149; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
2150; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
2151; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2152; SSSE3-NEXT: pand %xmm11, %xmm1
2153; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,3,3]
2154; SSSE3-NEXT: por %xmm1, %xmm11
2155; SSSE3-NEXT: movdqa %xmm5, %xmm1
2156; SSSE3-NEXT: pxor %xmm12, %xmm1
2157; SSSE3-NEXT: movdqa %xmm0, %xmm4
2158; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
2159; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
2160; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2161; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
2162; SSSE3-NEXT: pand %xmm7, %xmm1
2163; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3]
2164; SSSE3-NEXT: por %xmm1, %xmm14
2165; SSSE3-NEXT: movdqa %xmm2, %xmm1
2166; SSSE3-NEXT: pxor %xmm12, %xmm1
2167; SSSE3-NEXT: movdqa %xmm0, %xmm4
2168; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
2169; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
2170; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2171; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
2172; SSSE3-NEXT: pand %xmm7, %xmm1
2173; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
2174; SSSE3-NEXT: por %xmm1, %xmm15
2175; SSSE3-NEXT: movdqa %xmm3, %xmm1
2176; SSSE3-NEXT: pxor %xmm12, %xmm1
2177; SSSE3-NEXT: movdqa %xmm0, %xmm4
2178; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
2179; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
2180; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
2181; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
2182; SSSE3-NEXT: pand %xmm1, %xmm7
2183; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
2184; SSSE3-NEXT: por %xmm7, %xmm1
2185; SSSE3-NEXT: movdqa %xmm8, %xmm4
2186; SSSE3-NEXT: pxor %xmm12, %xmm4
2187; SSSE3-NEXT: movdqa %xmm0, %xmm7
2188; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
2189; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4
2190; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2191; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
2192; SSSE3-NEXT: pand %xmm0, %xmm4
2193; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
2194; SSSE3-NEXT: por %xmm4, %xmm0
2195; SSSE3-NEXT: pand %xmm0, %xmm8
2196; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
2197; SSSE3-NEXT: pandn %xmm4, %xmm0
2198; SSSE3-NEXT: por %xmm8, %xmm0
2199; SSSE3-NEXT: pand %xmm1, %xmm3
2200; SSSE3-NEXT: pandn %xmm4, %xmm1
2201; SSSE3-NEXT: por %xmm3, %xmm1
2202; SSSE3-NEXT: pand %xmm15, %xmm2
2203; SSSE3-NEXT: pandn %xmm4, %xmm15
2204; SSSE3-NEXT: por %xmm2, %xmm15
2205; SSSE3-NEXT: pand %xmm14, %xmm5
2206; SSSE3-NEXT: pandn %xmm4, %xmm14
2207; SSSE3-NEXT: por %xmm5, %xmm14
2208; SSSE3-NEXT: pand %xmm11, %xmm6
2209; SSSE3-NEXT: pandn %xmm4, %xmm11
2210; SSSE3-NEXT: por %xmm6, %xmm11
2211; SSSE3-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
2212; SSSE3-NEXT: pand %xmm13, %xmm2
2213; SSSE3-NEXT: pandn %xmm4, %xmm13
2214; SSSE3-NEXT: por %xmm2, %xmm13
2215; SSSE3-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
2216; SSSE3-NEXT: pand %xmm10, %xmm2
2217; SSSE3-NEXT: pandn %xmm4, %xmm10
2218; SSSE3-NEXT: por %xmm2, %xmm10
2219; SSSE3-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
2220; SSSE3-NEXT: pand %xmm9, %xmm2
2221; SSSE3-NEXT: pandn %xmm4, %xmm9
2222; SSSE3-NEXT: por %xmm2, %xmm9
2223; SSSE3-NEXT: movdqa %xmm9, %xmm2
2224; SSSE3-NEXT: pxor %xmm12, %xmm2
2225; SSSE3-NEXT: movdqa %xmm2, %xmm3
2226; SSSE3-NEXT: pcmpgtd %xmm12, %xmm3
2227; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2228; SSSE3-NEXT: pcmpeqd %xmm12, %xmm2
2229; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2230; SSSE3-NEXT: pand %xmm5, %xmm2
2231; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2232; SSSE3-NEXT: por %xmm2, %xmm3
2233; SSSE3-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
2234; SSSE3-NEXT: movdqa %xmm10, %xmm2
2235; SSSE3-NEXT: pxor %xmm12, %xmm2
2236; SSSE3-NEXT: movdqa %xmm2, %xmm3
2237; SSSE3-NEXT: pcmpgtd %xmm12, %xmm3
2238; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2239; SSSE3-NEXT: pcmpeqd %xmm12, %xmm2
2240; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2241; SSSE3-NEXT: pand %xmm5, %xmm2
2242; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2243; SSSE3-NEXT: por %xmm2, %xmm3
2244; SSSE3-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
2245; SSSE3-NEXT: movdqa %xmm13, %xmm2
2246; SSSE3-NEXT: pxor %xmm12, %xmm2
2247; SSSE3-NEXT: movdqa %xmm2, %xmm3
2248; SSSE3-NEXT: pcmpgtd %xmm12, %xmm3
2249; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2250; SSSE3-NEXT: pcmpeqd %xmm12, %xmm2
2251; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2252; SSSE3-NEXT: pand %xmm5, %xmm2
2253; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2254; SSSE3-NEXT: por %xmm2, %xmm3
2255; SSSE3-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
2256; SSSE3-NEXT: movdqa %xmm11, %xmm2
2257; SSSE3-NEXT: pxor %xmm12, %xmm2
2258; SSSE3-NEXT: movdqa %xmm2, %xmm3
2259; SSSE3-NEXT: pcmpgtd %xmm12, %xmm3
2260; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
2261; SSSE3-NEXT: pcmpeqd %xmm12, %xmm2
2262; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2263; SSSE3-NEXT: pand %xmm6, %xmm2
2264; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
2265; SSSE3-NEXT: por %xmm2, %xmm6
2266; SSSE3-NEXT: movdqa %xmm14, %xmm2
2267; SSSE3-NEXT: pxor %xmm12, %xmm2
2268; SSSE3-NEXT: movdqa %xmm2, %xmm3
2269; SSSE3-NEXT: pcmpgtd %xmm12, %xmm3
2270; SSSE3-NEXT: pcmpeqd %xmm12, %xmm2
2271; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2272; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
2273; SSSE3-NEXT: pand %xmm7, %xmm2
2274; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
2275; SSSE3-NEXT: por %xmm2, %xmm7
2276; SSSE3-NEXT: movdqa %xmm15, %xmm2
2277; SSSE3-NEXT: pxor %xmm12, %xmm2
2278; SSSE3-NEXT: movdqa %xmm2, %xmm3
2279; SSSE3-NEXT: pcmpgtd %xmm12, %xmm3
2280; SSSE3-NEXT: pcmpeqd %xmm12, %xmm2
2281; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2282; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
2283; SSSE3-NEXT: pand %xmm8, %xmm2
2284; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3]
2285; SSSE3-NEXT: por %xmm2, %xmm8
2286; SSSE3-NEXT: movdqa %xmm1, %xmm2
2287; SSSE3-NEXT: pxor %xmm12, %xmm2
2288; SSSE3-NEXT: movdqa %xmm2, %xmm3
2289; SSSE3-NEXT: pcmpgtd %xmm12, %xmm3
2290; SSSE3-NEXT: pcmpeqd %xmm12, %xmm2
2291; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2292; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2293; SSSE3-NEXT: pand %xmm5, %xmm2
2294; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2295; SSSE3-NEXT: por %xmm2, %xmm3
2296; SSSE3-NEXT: movdqa %xmm0, %xmm2
2297; SSSE3-NEXT: pxor %xmm12, %xmm2
2298; SSSE3-NEXT: movdqa %xmm2, %xmm5
2299; SSSE3-NEXT: pcmpgtd %xmm12, %xmm5
2300; SSSE3-NEXT: pcmpeqd %xmm12, %xmm2
2301; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2302; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
2303; SSSE3-NEXT: pand %xmm4, %xmm2
2304; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
2305; SSSE3-NEXT: por %xmm2, %xmm12
2306; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
2307; SSSE3-NEXT: pand %xmm2, %xmm9
2308; SSSE3-NEXT: pand -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
2309; SSSE3-NEXT: pand %xmm2, %xmm10
2310; SSSE3-NEXT: pand -{{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
2311; SSSE3-NEXT: packuswb %xmm9, %xmm10
2312; SSSE3-NEXT: pand %xmm2, %xmm13
2313; SSSE3-NEXT: pand -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
2314; SSSE3-NEXT: pand %xmm2, %xmm11
2315; SSSE3-NEXT: pand %xmm6, %xmm11
2316; SSSE3-NEXT: packuswb %xmm13, %xmm11
2317; SSSE3-NEXT: packuswb %xmm10, %xmm11
2318; SSSE3-NEXT: pand %xmm2, %xmm14
2319; SSSE3-NEXT: pand %xmm7, %xmm14
2320; SSSE3-NEXT: pand %xmm2, %xmm15
2321; SSSE3-NEXT: pand %xmm8, %xmm15
2322; SSSE3-NEXT: packuswb %xmm14, %xmm15
2323; SSSE3-NEXT: pand %xmm2, %xmm1
2324; SSSE3-NEXT: pand %xmm3, %xmm1
2325; SSSE3-NEXT: pand %xmm2, %xmm0
2326; SSSE3-NEXT: pand %xmm12, %xmm0
2327; SSSE3-NEXT: packuswb %xmm1, %xmm0
2328; SSSE3-NEXT: packuswb %xmm15, %xmm0
2329; SSSE3-NEXT: packuswb %xmm11, %xmm0
2330; SSSE3-NEXT: retq
2331;
2332; SSE41-LABEL: trunc_packus_v16i64_v16i8:
2333; SSE41: # %bb.0:
2334; SSE41-NEXT: movdqa %xmm7, %xmm8
2335; SSE41-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
2336; SSE41-NEXT: movdqa %xmm6, %xmm7
2337; SSE41-NEXT: movdqa %xmm5, %xmm6
2338; SSE41-NEXT: movdqa %xmm4, %xmm5
2339; SSE41-NEXT: movdqa %xmm3, %xmm4
2340; SSE41-NEXT: movdqa %xmm0, %xmm11
2341; SSE41-NEXT: movdqa {{.*#+}} xmm13 = [2147483648,0,2147483648,0]
2342; SSE41-NEXT: movdqa %xmm8, %xmm0
2343; SSE41-NEXT: pxor %xmm13, %xmm0
2344; SSE41-NEXT: movdqa {{.*#+}} xmm14 = [2147483903,2147483903]
2345; SSE41-NEXT: movdqa %xmm14, %xmm3
2346; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
2347; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
2348; SSE41-NEXT: pcmpeqd %xmm14, %xmm0
2349; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2350; SSE41-NEXT: pand %xmm8, %xmm0
2351; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3]
2352; SSE41-NEXT: por %xmm0, %xmm8
2353; SSE41-NEXT: movdqa %xmm7, %xmm0
2354; SSE41-NEXT: pxor %xmm13, %xmm0
2355; SSE41-NEXT: movdqa %xmm14, %xmm3
2356; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
2357; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
2358; SSE41-NEXT: pcmpeqd %xmm14, %xmm0
2359; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2360; SSE41-NEXT: pand %xmm9, %xmm0
2361; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3]
2362; SSE41-NEXT: por %xmm0, %xmm10
2363; SSE41-NEXT: movdqa %xmm6, %xmm0
2364; SSE41-NEXT: pxor %xmm13, %xmm0
2365; SSE41-NEXT: movdqa %xmm14, %xmm3
2366; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
2367; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
2368; SSE41-NEXT: pcmpeqd %xmm14, %xmm0
2369; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2370; SSE41-NEXT: pand %xmm9, %xmm0
2371; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,3,3]
2372; SSE41-NEXT: por %xmm0, %xmm12
2373; SSE41-NEXT: movdqa %xmm11, %xmm0
2374; SSE41-NEXT: pxor %xmm13, %xmm0
2375; SSE41-NEXT: movdqa %xmm14, %xmm3
2376; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
2377; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
2378; SSE41-NEXT: pcmpeqd %xmm14, %xmm0
2379; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm0[1,1,3,3]
2380; SSE41-NEXT: pand %xmm9, %xmm15
2381; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
2382; SSE41-NEXT: por %xmm15, %xmm0
2383; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255]
2384; SSE41-NEXT: movapd %xmm3, %xmm9
2385; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm9
2386; SSE41-NEXT: movdqa %xmm1, %xmm0
2387; SSE41-NEXT: pxor %xmm13, %xmm0
2388; SSE41-NEXT: movdqa %xmm14, %xmm11
2389; SSE41-NEXT: pcmpgtd %xmm0, %xmm11
2390; SSE41-NEXT: pcmpeqd %xmm14, %xmm0
2391; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm0[1,1,3,3]
2392; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
2393; SSE41-NEXT: pand %xmm0, %xmm15
2394; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
2395; SSE41-NEXT: por %xmm15, %xmm0
2396; SSE41-NEXT: movapd %xmm3, %xmm11
2397; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11
2398; SSE41-NEXT: movdqa %xmm2, %xmm0
2399; SSE41-NEXT: pxor %xmm13, %xmm0
2400; SSE41-NEXT: movdqa %xmm14, %xmm1
2401; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
2402; SSE41-NEXT: pcmpeqd %xmm14, %xmm0
2403; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm0[1,1,3,3]
2404; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
2405; SSE41-NEXT: pand %xmm0, %xmm15
2406; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
2407; SSE41-NEXT: por %xmm15, %xmm0
2408; SSE41-NEXT: movapd %xmm3, %xmm15
2409; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm15
2410; SSE41-NEXT: movdqa %xmm5, %xmm0
2411; SSE41-NEXT: pxor %xmm13, %xmm0
2412; SSE41-NEXT: movdqa %xmm14, %xmm1
2413; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
2414; SSE41-NEXT: pcmpeqd %xmm14, %xmm0
2415; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2416; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
2417; SSE41-NEXT: pand %xmm2, %xmm0
2418; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2419; SSE41-NEXT: por %xmm0, %xmm1
2420; SSE41-NEXT: movdqa %xmm4, %xmm0
2421; SSE41-NEXT: pxor %xmm13, %xmm0
2422; SSE41-NEXT: movdqa %xmm14, %xmm2
2423; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
2424; SSE41-NEXT: pcmpeqd %xmm14, %xmm0
2425; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
2426; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
2427; SSE41-NEXT: pand %xmm0, %xmm14
2428; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
2429; SSE41-NEXT: por %xmm14, %xmm0
2430; SSE41-NEXT: movapd %xmm3, %xmm14
2431; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14
2432; SSE41-NEXT: movapd %xmm3, %xmm2
2433; SSE41-NEXT: movdqa %xmm1, %xmm0
2434; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
2435; SSE41-NEXT: movapd %xmm3, %xmm4
2436; SSE41-NEXT: movdqa %xmm12, %xmm0
2437; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4
2438; SSE41-NEXT: movapd %xmm3, %xmm1
2439; SSE41-NEXT: movdqa %xmm10, %xmm0
2440; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
2441; SSE41-NEXT: movapd %xmm3, %xmm5
2442; SSE41-NEXT: movdqa %xmm8, %xmm0
2443; SSE41-NEXT: blendvpd %xmm0, -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
2444; SSE41-NEXT: movapd %xmm5, %xmm0
2445; SSE41-NEXT: xorpd %xmm13, %xmm0
2446; SSE41-NEXT: movapd %xmm0, %xmm6
2447; SSE41-NEXT: pcmpgtd %xmm13, %xmm6
2448; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
2449; SSE41-NEXT: pcmpeqd %xmm13, %xmm0
2450; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2451; SSE41-NEXT: pand %xmm7, %xmm0
2452; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2453; SSE41-NEXT: por %xmm0, %xmm6
2454; SSE41-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
2455; SSE41-NEXT: movapd %xmm1, %xmm0
2456; SSE41-NEXT: xorpd %xmm13, %xmm0
2457; SSE41-NEXT: movapd %xmm0, %xmm6
2458; SSE41-NEXT: pcmpgtd %xmm13, %xmm6
2459; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
2460; SSE41-NEXT: pcmpeqd %xmm13, %xmm0
2461; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2462; SSE41-NEXT: pand %xmm7, %xmm0
2463; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2464; SSE41-NEXT: por %xmm0, %xmm6
2465; SSE41-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
2466; SSE41-NEXT: movapd %xmm4, %xmm0
2467; SSE41-NEXT: xorpd %xmm13, %xmm0
2468; SSE41-NEXT: movapd %xmm0, %xmm6
2469; SSE41-NEXT: pcmpgtd %xmm13, %xmm6
2470; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
2471; SSE41-NEXT: pcmpeqd %xmm13, %xmm0
2472; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2473; SSE41-NEXT: pand %xmm7, %xmm0
2474; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2475; SSE41-NEXT: por %xmm0, %xmm6
2476; SSE41-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
2477; SSE41-NEXT: movapd %xmm2, %xmm0
2478; SSE41-NEXT: xorpd %xmm13, %xmm0
2479; SSE41-NEXT: movapd %xmm0, %xmm6
2480; SSE41-NEXT: pcmpgtd %xmm13, %xmm6
2481; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
2482; SSE41-NEXT: pcmpeqd %xmm13, %xmm0
2483; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2484; SSE41-NEXT: pand %xmm7, %xmm0
2485; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2486; SSE41-NEXT: por %xmm0, %xmm6
2487; SSE41-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
2488; SSE41-NEXT: movapd %xmm14, %xmm0
2489; SSE41-NEXT: xorpd %xmm13, %xmm0
2490; SSE41-NEXT: movapd %xmm0, %xmm6
2491; SSE41-NEXT: pcmpgtd %xmm13, %xmm6
2492; SSE41-NEXT: pcmpeqd %xmm13, %xmm0
2493; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2494; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
2495; SSE41-NEXT: pand %xmm7, %xmm0
2496; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
2497; SSE41-NEXT: por %xmm0, %xmm8
2498; SSE41-NEXT: movapd %xmm15, %xmm0
2499; SSE41-NEXT: xorpd %xmm13, %xmm0
2500; SSE41-NEXT: movapd %xmm0, %xmm7
2501; SSE41-NEXT: pcmpgtd %xmm13, %xmm7
2502; SSE41-NEXT: pcmpeqd %xmm13, %xmm0
2503; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2504; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
2505; SSE41-NEXT: pand %xmm10, %xmm0
2506; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
2507; SSE41-NEXT: por %xmm0, %xmm10
2508; SSE41-NEXT: movapd %xmm11, %xmm0
2509; SSE41-NEXT: xorpd %xmm13, %xmm0
2510; SSE41-NEXT: movapd %xmm0, %xmm7
2511; SSE41-NEXT: pcmpgtd %xmm13, %xmm7
2512; SSE41-NEXT: pcmpeqd %xmm13, %xmm0
2513; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2514; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2]
2515; SSE41-NEXT: pand %xmm12, %xmm0
2516; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,3,3]
2517; SSE41-NEXT: por %xmm0, %xmm12
2518; SSE41-NEXT: movapd %xmm9, %xmm0
2519; SSE41-NEXT: xorpd %xmm13, %xmm0
2520; SSE41-NEXT: movapd %xmm0, %xmm7
2521; SSE41-NEXT: pcmpgtd %xmm13, %xmm7
2522; SSE41-NEXT: pcmpeqd %xmm13, %xmm0
2523; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2524; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
2525; SSE41-NEXT: pand %xmm6, %xmm0
2526; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
2527; SSE41-NEXT: por %xmm0, %xmm13
2528; SSE41-NEXT: andpd %xmm3, %xmm5
2529; SSE41-NEXT: andpd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
2530; SSE41-NEXT: andpd %xmm3, %xmm1
2531; SSE41-NEXT: andpd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
2532; SSE41-NEXT: packuswb %xmm5, %xmm1
2533; SSE41-NEXT: andpd %xmm3, %xmm4
2534; SSE41-NEXT: andpd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
2535; SSE41-NEXT: andpd %xmm3, %xmm2
2536; SSE41-NEXT: andpd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
2537; SSE41-NEXT: packuswb %xmm4, %xmm2
2538; SSE41-NEXT: packuswb %xmm1, %xmm2
2539; SSE41-NEXT: andpd %xmm3, %xmm14
2540; SSE41-NEXT: andpd %xmm8, %xmm14
2541; SSE41-NEXT: andpd %xmm3, %xmm15
2542; SSE41-NEXT: andpd %xmm10, %xmm15
2543; SSE41-NEXT: packuswb %xmm14, %xmm15
2544; SSE41-NEXT: andpd %xmm3, %xmm11
2545; SSE41-NEXT: andpd %xmm12, %xmm11
2546; SSE41-NEXT: andpd %xmm3, %xmm9
2547; SSE41-NEXT: andpd %xmm13, %xmm9
2548; SSE41-NEXT: packuswb %xmm11, %xmm9
2549; SSE41-NEXT: packuswb %xmm15, %xmm9
2550; SSE41-NEXT: packuswb %xmm2, %xmm9
2551; SSE41-NEXT: movdqa %xmm9, %xmm0
2552; SSE41-NEXT: retq
2553;
2554; AVX1-LABEL: trunc_packus_v16i64_v16i8:
2555; AVX1: # %bb.0:
2556; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
2557; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
2558; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5
2559; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm6
2560; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm9
2561; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
2562; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6
2563; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm7
2564; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
2565; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
2566; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7
2567; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm8
2568; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
2569; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
2570; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5
2571; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8
2572; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5
2573; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [255,255,255,255]
2574; AVX1-NEXT: vblendvpd %ymm5, %ymm3, %ymm8, %ymm3
2575; AVX1-NEXT: vblendvpd %ymm7, %ymm2, %ymm8, %ymm2
2576; AVX1-NEXT: vblendvpd %ymm6, %ymm1, %ymm8, %ymm5
2577; AVX1-NEXT: vblendvpd %ymm9, %ymm0, %ymm8, %ymm0
2578; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2579; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
2580; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm7
2581; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
2582; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm6
2583; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm7
2584; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2585; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm3
2586; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
2587; AVX1-NEXT: vpackuswb %xmm6, %xmm3, %xmm3
2588; AVX1-NEXT: vpcmpgtq %xmm1, %xmm7, %xmm6
2589; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
2590; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
2591; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm7
2592; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2593; AVX1-NEXT: vpand %xmm2, %xmm7, %xmm2
2594; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
2595; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
2596; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2597; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm3
2598; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
2599; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
2600; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm6
2601; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
2602; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5
2603; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
2604; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
2605; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm5
2606; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
2607; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
2608; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
2609; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2610; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
2611; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
2612; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
2613; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2614; AVX1-NEXT: vzeroupper
2615; AVX1-NEXT: retq
2616;
2617; AVX2-SLOW-LABEL: trunc_packus_v16i64_v16i8:
2618; AVX2-SLOW: # %bb.0:
2619; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
2620; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
2621; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm6
2622; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm7
2623; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm8
2624; AVX2-SLOW-NEXT: vblendvpd %ymm8, %ymm1, %ymm4, %ymm1
2625; AVX2-SLOW-NEXT: vblendvpd %ymm7, %ymm0, %ymm4, %ymm0
2626; AVX2-SLOW-NEXT: vblendvpd %ymm6, %ymm3, %ymm4, %ymm3
2627; AVX2-SLOW-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
2628; AVX2-SLOW-NEXT: vpxor %xmm4, %xmm4, %xmm4
2629; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5
2630; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm6
2631; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm7
2632; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm4
2633; AVX2-SLOW-NEXT: vpand %ymm1, %ymm4, %ymm1
2634; AVX2-SLOW-NEXT: vpand %ymm0, %ymm7, %ymm0
2635; AVX2-SLOW-NEXT: vpand %ymm3, %ymm6, %ymm3
2636; AVX2-SLOW-NEXT: vpand %ymm2, %ymm5, %ymm2
2637; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
2638; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2639; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
2640; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
2641; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2642; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2643; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2644; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2645; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2646; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
2647; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2648; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2649; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
2650; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2651; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2652; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2653; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2654; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
2655; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2656; AVX2-SLOW-NEXT: vzeroupper
2657; AVX2-SLOW-NEXT: retq
2658;
2659; AVX2-FAST-LABEL: trunc_packus_v16i64_v16i8:
2660; AVX2-FAST: # %bb.0:
2661; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
2662; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
2663; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm6
2664; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm7
2665; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm8
2666; AVX2-FAST-NEXT: vblendvpd %ymm8, %ymm1, %ymm4, %ymm1
2667; AVX2-FAST-NEXT: vblendvpd %ymm7, %ymm0, %ymm4, %ymm0
2668; AVX2-FAST-NEXT: vblendvpd %ymm6, %ymm3, %ymm4, %ymm3
2669; AVX2-FAST-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
2670; AVX2-FAST-NEXT: vpxor %xmm4, %xmm4, %xmm4
2671; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5
2672; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm6
2673; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm7
2674; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm4
2675; AVX2-FAST-NEXT: vpand %ymm1, %ymm4, %ymm1
2676; AVX2-FAST-NEXT: vpand %ymm0, %ymm7, %ymm0
2677; AVX2-FAST-NEXT: vpand %ymm3, %ymm6, %ymm3
2678; AVX2-FAST-NEXT: vpand %ymm2, %ymm5, %ymm2
2679; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
2680; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
2681; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
2682; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2683; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2684; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2685; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2686; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2687; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
2688; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
2689; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
2690; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2691; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2692; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2693; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
2694; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2695; AVX2-FAST-NEXT: vzeroupper
2696; AVX2-FAST-NEXT: retq
2697;
2698; AVX512-LABEL: trunc_packus_v16i64_v16i8:
2699; AVX512: # %bb.0:
2700; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
2701; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0
2702; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1
2703; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
2704; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
2705; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
2706; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2707; AVX512-NEXT: vpmovqd %zmm1, %ymm1
2708; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2709; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2710; AVX512-NEXT: vzeroupper
2711; AVX512-NEXT: retq
2712 %1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
2713 %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
2714 %3 = icmp sgt <16 x i64> %2, zeroinitializer
2715 %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer
2716 %5 = trunc <16 x i64> %4 to <16 x i8>
2717 ret <16 x i8> %5
2718}
2719
; PACKUS-style saturating truncation of <8 x i32> to <8 x i8>: the IR below
; clamps every lane to [0, 255] and truncates; the per-prefix CHECK lines
; record the expected lowering for each RUN configuration (SSE2, SSSE3,
; SSE41, AVX1, AVX2, AVX512F/VL/BW/BWVL).
2720define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
2721; SSE2-LABEL: trunc_packus_v8i32_v8i8:
2722; SSE2: # %bb.0:
2723; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
2724; SSE2-NEXT: movdqa %xmm2, %xmm3
2725; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
2726; SSE2-NEXT: movdqa %xmm2, %xmm4
2727; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
2728; SSE2-NEXT: pand %xmm4, %xmm0
2729; SSE2-NEXT: pandn %xmm2, %xmm4
2730; SSE2-NEXT: por %xmm0, %xmm4
2731; SSE2-NEXT: pand %xmm3, %xmm1
2732; SSE2-NEXT: pandn %xmm2, %xmm3
2733; SSE2-NEXT: por %xmm1, %xmm3
2734; SSE2-NEXT: pxor %xmm1, %xmm1
2735; SSE2-NEXT: movdqa %xmm3, %xmm2
2736; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
2737; SSE2-NEXT: movdqa %xmm4, %xmm0
2738; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
2739; SSE2-NEXT: pand %xmm4, %xmm0
2740; SSE2-NEXT: pand %xmm3, %xmm2
2741; SSE2-NEXT: pslld $16, %xmm2
2742; SSE2-NEXT: psrad $16, %xmm2
2743; SSE2-NEXT: pslld $16, %xmm0
2744; SSE2-NEXT: psrad $16, %xmm0
2745; SSE2-NEXT: packssdw %xmm2, %xmm0
2746; SSE2-NEXT: retq
2747;
2748; SSSE3-LABEL: trunc_packus_v8i32_v8i8:
2749; SSSE3: # %bb.0:
2750; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
2751; SSSE3-NEXT: movdqa %xmm2, %xmm3
2752; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
2753; SSSE3-NEXT: movdqa %xmm2, %xmm4
2754; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
2755; SSSE3-NEXT: pand %xmm4, %xmm0
2756; SSSE3-NEXT: pandn %xmm2, %xmm4
2757; SSSE3-NEXT: por %xmm0, %xmm4
2758; SSSE3-NEXT: pand %xmm3, %xmm1
2759; SSSE3-NEXT: pandn %xmm2, %xmm3
2760; SSSE3-NEXT: por %xmm1, %xmm3
2761; SSSE3-NEXT: pxor %xmm1, %xmm1
2762; SSSE3-NEXT: movdqa %xmm3, %xmm2
2763; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
2764; SSSE3-NEXT: movdqa %xmm4, %xmm0
2765; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
2766; SSSE3-NEXT: pand %xmm4, %xmm0
2767; SSSE3-NEXT: pand %xmm3, %xmm2
2768; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2769; SSSE3-NEXT: pshufb %xmm1, %xmm2
2770; SSSE3-NEXT: pshufb %xmm1, %xmm0
2771; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2772; SSSE3-NEXT: retq
2773;
2774; SSE41-LABEL: trunc_packus_v8i32_v8i8:
2775; SSE41: # %bb.0:
2776; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
2777; SSE41-NEXT: pminsd %xmm2, %xmm1
2778; SSE41-NEXT: pminsd %xmm2, %xmm0
2779; SSE41-NEXT: pxor %xmm2, %xmm2
2780; SSE41-NEXT: pmaxsd %xmm2, %xmm0
2781; SSE41-NEXT: pmaxsd %xmm2, %xmm1
2782; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2783; SSE41-NEXT: pshufb %xmm2, %xmm1
2784; SSE41-NEXT: pshufb %xmm2, %xmm0
2785; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2786; SSE41-NEXT: retq
2787;
2788; AVX1-LABEL: trunc_packus_v8i32_v8i8:
2789; AVX1: # %bb.0:
2790; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2791; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
2792; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
2793; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
2794; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2795; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
2796; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
2797; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2798; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2799; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2800; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2801; AVX1-NEXT: vzeroupper
2802; AVX1-NEXT: retq
2803;
2804; AVX2-LABEL: trunc_packus_v8i32_v8i8:
2805; AVX2: # %bb.0:
2806; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
2807; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
2808; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2809; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
2810; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2811; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2812; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
2813; AVX2-NEXT: vzeroupper
2814; AVX2-NEXT: retq
2815;
2816; AVX512F-LABEL: trunc_packus_v8i32_v8i8:
2817; AVX512F: # %bb.0:
2818; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
2819; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
2820; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2821; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
2822; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
2823; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
2824; AVX512F-NEXT: vzeroupper
2825; AVX512F-NEXT: retq
2826;
2827; AVX512VL-LABEL: trunc_packus_v8i32_v8i8:
2828; AVX512VL: # %bb.0:
2829; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
2830; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
2831; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
2832; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
2833; AVX512VL-NEXT: vzeroupper
2834; AVX512VL-NEXT: retq
2835;
2836; AVX512BW-LABEL: trunc_packus_v8i32_v8i8:
2837; AVX512BW: # %bb.0:
2838; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
2839; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
2840; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2841; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
2842; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
2843; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
2844; AVX512BW-NEXT: vzeroupper
2845; AVX512BW-NEXT: retq
2846;
2847; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8:
2848; AVX512BWVL: # %bb.0:
2849; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
2850; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
2851; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
2852; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
2853; AVX512BWVL-NEXT: vzeroupper
2854; AVX512BWVL-NEXT: retq
; Reference IR: signed min against 255 (icmp slt / select), then signed max
; against zero (icmp sgt / select), then truncate each lane to i8 — i.e. the
; unsigned-saturating pattern the backends above are expected to recognize.
; The CHECK lines in this function were produced by
; utils/update_llc_test_checks.py; regenerate them with that script instead
; of editing by hand.
2855 %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
2856 %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
2857 %3 = icmp sgt <8 x i32> %2, zeroinitializer
2858 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
2859 %5 = trunc <8 x i32> %4 to <8 x i8>
2860 ret <8 x i8> %5
2861}
2862
2863define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32> %a0) {
2864; SSE2-LABEL: trunc_packus_v16i32_v16i8:
2865; SSE2: # %bb.0:
2866; SSE2-NEXT: movdqa %xmm0, %xmm4
2867; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
2868; SSE2-NEXT: movdqa %xmm8, %xmm6
2869; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
2870; SSE2-NEXT: movdqa %xmm8, %xmm5
2871; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
2872; SSE2-NEXT: movdqa %xmm8, %xmm7
2873; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
2874; SSE2-NEXT: movdqa %xmm8, %xmm0
2875; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
2876; SSE2-NEXT: pand %xmm0, %xmm4
2877; SSE2-NEXT: pandn %xmm8, %xmm0
2878; SSE2-NEXT: por %xmm4, %xmm0
2879; SSE2-NEXT: pand %xmm7, %xmm1
2880; SSE2-NEXT: pandn %xmm8, %xmm7
2881; SSE2-NEXT: por %xmm1, %xmm7
2882; SSE2-NEXT: pand %xmm5, %xmm2
2883; SSE2-NEXT: pandn %xmm8, %xmm5
2884; SSE2-NEXT: por %xmm2, %xmm5
2885; SSE2-NEXT: pand %xmm6, %xmm3
2886; SSE2-NEXT: pandn %xmm8, %xmm6
2887; SSE2-NEXT: por %xmm3, %xmm6
2888; SSE2-NEXT: pxor %xmm9, %xmm9
2889; SSE2-NEXT: movdqa %xmm6, %xmm2
2890; SSE2-NEXT: pcmpgtd %xmm9, %xmm2
2891; SSE2-NEXT: movdqa %xmm5, %xmm3
2892; SSE2-NEXT: pcmpgtd %xmm9, %xmm3
2893; SSE2-NEXT: movdqa %xmm7, %xmm4
2894; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
2895; SSE2-NEXT: movdqa %xmm0, %xmm1
2896; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
2897; SSE2-NEXT: pand %xmm8, %xmm6
2898; SSE2-NEXT: pand %xmm2, %xmm6
2899; SSE2-NEXT: pand %xmm8, %xmm5
2900; SSE2-NEXT: pand %xmm3, %xmm5
2901; SSE2-NEXT: packuswb %xmm6, %xmm5
2902; SSE2-NEXT: pand %xmm8, %xmm7
2903; SSE2-NEXT: pand %xmm4, %xmm7
2904; SSE2-NEXT: pand %xmm8, %xmm0
2905; SSE2-NEXT: pand %xmm1, %xmm0
2906; SSE2-NEXT: packuswb %xmm7, %xmm0
2907; SSE2-NEXT: packuswb %xmm5, %xmm0
2908; SSE2-NEXT: retq
2909;
2910; SSSE3-LABEL: trunc_packus_v16i32_v16i8:
2911; SSSE3: # %bb.0:
2912; SSSE3-NEXT: movdqa %xmm0, %xmm4
2913; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
2914; SSSE3-NEXT: movdqa %xmm8, %xmm6
2915; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
2916; SSSE3-NEXT: movdqa %xmm8, %xmm5
2917; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
2918; SSSE3-NEXT: movdqa %xmm8, %xmm7
2919; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
2920; SSSE3-NEXT: movdqa %xmm8, %xmm0
2921; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
2922; SSSE3-NEXT: pand %xmm0, %xmm4
2923; SSSE3-NEXT: pandn %xmm8, %xmm0
2924; SSSE3-NEXT: por %xmm4, %xmm0
2925; SSSE3-NEXT: pand %xmm7, %xmm1
2926; SSSE3-NEXT: pandn %xmm8, %xmm7
2927; SSSE3-NEXT: por %xmm1, %xmm7
2928; SSSE3-NEXT: pand %xmm5, %xmm2
2929; SSSE3-NEXT: pandn %xmm8, %xmm5
2930; SSSE3-NEXT: por %xmm2, %xmm5
2931; SSSE3-NEXT: pand %xmm6, %xmm3
2932; SSSE3-NEXT: pandn %xmm8, %xmm6
2933; SSSE3-NEXT: por %xmm3, %xmm6
2934; SSSE3-NEXT: pxor %xmm9, %xmm9
2935; SSSE3-NEXT: movdqa %xmm6, %xmm2
2936; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2
2937; SSSE3-NEXT: movdqa %xmm5, %xmm3
2938; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3
2939; SSSE3-NEXT: movdqa %xmm7, %xmm4
2940; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
2941; SSSE3-NEXT: movdqa %xmm0, %xmm1
2942; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
2943; SSSE3-NEXT: pand %xmm8, %xmm6
2944; SSSE3-NEXT: pand %xmm2, %xmm6
2945; SSSE3-NEXT: pand %xmm8, %xmm5
2946; SSSE3-NEXT: pand %xmm3, %xmm5
2947; SSSE3-NEXT: packuswb %xmm6, %xmm5
2948; SSSE3-NEXT: pand %xmm8, %xmm7
2949; SSSE3-NEXT: pand %xmm4, %xmm7
2950; SSSE3-NEXT: pand %xmm8, %xmm0
2951; SSSE3-NEXT: pand %xmm1, %xmm0
2952; SSSE3-NEXT: packuswb %xmm7, %xmm0
2953; SSSE3-NEXT: packuswb %xmm5, %xmm0
2954; SSSE3-NEXT: retq
2955;
2956; SSE41-LABEL: trunc_packus_v16i32_v16i8:
2957; SSE41: # %bb.0:
2958; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
2959; SSE41-NEXT: pminsd %xmm4, %xmm3
2960; SSE41-NEXT: pminsd %xmm4, %xmm2
2961; SSE41-NEXT: pminsd %xmm4, %xmm1
2962; SSE41-NEXT: pminsd %xmm4, %xmm0
2963; SSE41-NEXT: pxor %xmm5, %xmm5
2964; SSE41-NEXT: pmaxsd %xmm5, %xmm0
2965; SSE41-NEXT: pmaxsd %xmm5, %xmm1
2966; SSE41-NEXT: pmaxsd %xmm5, %xmm2
2967; SSE41-NEXT: pmaxsd %xmm5, %xmm3
2968; SSE41-NEXT: pand %xmm4, %xmm3
2969; SSE41-NEXT: pand %xmm4, %xmm2
2970; SSE41-NEXT: packuswb %xmm3, %xmm2
2971; SSE41-NEXT: pand %xmm4, %xmm1
2972; SSE41-NEXT: pand %xmm4, %xmm0
2973; SSE41-NEXT: packuswb %xmm1, %xmm0
2974; SSE41-NEXT: packuswb %xmm2, %xmm0
2975; SSE41-NEXT: retq
2976;
2977; AVX1-LABEL: trunc_packus_v16i32_v16i8:
2978; AVX1: # %bb.0:
2979; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2980; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
2981; AVX1-NEXT: vpminsd %xmm3, %xmm2, %xmm2
2982; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1
2983; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
2984; AVX1-NEXT: vpminsd %xmm3, %xmm4, %xmm4
2985; AVX1-NEXT: vpminsd %xmm3, %xmm0, %xmm0
2986; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
2987; AVX1-NEXT: vpmaxsd %xmm5, %xmm0, %xmm0
2988; AVX1-NEXT: vpmaxsd %xmm5, %xmm4, %xmm4
2989; AVX1-NEXT: vpmaxsd %xmm5, %xmm1, %xmm1
2990; AVX1-NEXT: vpmaxsd %xmm5, %xmm2, %xmm2
2991; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2992; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
2993; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2994; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
2995; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2996; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2997; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2998; AVX1-NEXT: vzeroupper
2999; AVX1-NEXT: retq
3000;
3001; AVX2-LABEL: trunc_packus_v16i32_v16i8:
3002; AVX2: # %bb.0:
3003; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3004; AVX2-NEXT: vpminsd %ymm2, %ymm1, %ymm1
3005; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0
3006; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3007; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0
3008; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1
3009; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3010; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3011; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3012; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3013; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
3014; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3015; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3016; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
3017; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3018; AVX2-NEXT: vzeroupper
3019; AVX2-NEXT: retq
3020;
3021; AVX512-LABEL: trunc_packus_v16i32_v16i8:
3022; AVX512: # %bb.0:
3023; AVX512-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
3024; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
3025; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
3026; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3027; AVX512-NEXT: vzeroupper
3028; AVX512-NEXT: retq
3029 %1 = icmp slt <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
3030 %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
3031 %3 = icmp sgt <16 x i32> %2, zeroinitializer
3032 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
3033 %5 = trunc <16 x i32> %4 to <16 x i8>
3034 ret <16 x i8> %5
3035}
3036
; Truncate <16 x i16> to <16 x i8> with unsigned saturation: clamp each lane
; to [0,255] via signed-min with 255 followed by signed-max with 0, then
; truncate. The CHECK lines below are autogenerated FileCheck assertions
; (do not hand-edit; regenerate with utils/update_llc_test_checks.py) and
; pin the expected lowering per subtarget: pminsw/pmaxsw + packuswb or
; pshufb on SSE, reg-mem vpminsw on AVX2+, and vpmovwb truncation on
; AVX512BW targets.
define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
; SSE2-LABEL: trunc_packus_v16i16_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pminsw %xmm2, %xmm1
; SSE2-NEXT: pminsw %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pmaxsw %xmm3, %xmm0
; SSE2-NEXT: pmaxsw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v16i16_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT: pminsw %xmm2, %xmm1
; SSSE3-NEXT: pminsw %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pmaxsw %xmm2, %xmm0
; SSSE3-NEXT: pmaxsw %xmm2, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v16i16_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pminsw %xmm2, %xmm1
; SSE41-NEXT: pminsw %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmaxsw %xmm2, %xmm0
; SSE41-NEXT: pmaxsw %xmm2, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
; Canonical PACKUS pattern: smin(%a0, 255), then smax(result, 0), then trunc.
  %1 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = icmp sgt <16 x i16> %2, zeroinitializer
  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
  %5 = trunc <16 x i16> %4 to <16 x i8>
  ret <16 x i8> %5
}
3153
; Truncate <32 x i16> to <32 x i8> with unsigned saturation (clamp to
; [0,255] via smin/smax, then trunc). The CHECK lines are autogenerated
; FileCheck assertions (regenerate with utils/update_llc_test_checks.py).
; Pre-AVX512BW targets operate on the input 256 bits at a time; AVX512BW
; subtargets lower the whole clamp+truncate to zmm vpminsw/vpmaxsw plus a
; single vpmovwb.
define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) {
; SSE2-LABEL: trunc_packus_v32i16_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pminsw %xmm4, %xmm1
; SSE2-NEXT: pminsw %xmm4, %xmm0
; SSE2-NEXT: pminsw %xmm4, %xmm3
; SSE2-NEXT: pminsw %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pmaxsw %xmm5, %xmm2
; SSE2-NEXT: pmaxsw %xmm5, %xmm3
; SSE2-NEXT: pmaxsw %xmm5, %xmm0
; SSE2-NEXT: pmaxsw %xmm5, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: packuswb %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v32i16_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT: pminsw %xmm4, %xmm1
; SSSE3-NEXT: pminsw %xmm4, %xmm0
; SSSE3-NEXT: pminsw %xmm4, %xmm3
; SSSE3-NEXT: pminsw %xmm2, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pmaxsw %xmm2, %xmm4
; SSSE3-NEXT: pmaxsw %xmm2, %xmm3
; SSSE3-NEXT: pmaxsw %xmm2, %xmm0
; SSSE3-NEXT: pmaxsw %xmm2, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v32i16_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pminsw %xmm4, %xmm1
; SSE41-NEXT: pminsw %xmm4, %xmm0
; SSE41-NEXT: pminsw %xmm4, %xmm3
; SSE41-NEXT: pminsw %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmaxsw %xmm2, %xmm4
; SSE41-NEXT: pmaxsw %xmm2, %xmm3
; SSE41-NEXT: pmaxsw %xmm2, %xmm0
; SSE41-NEXT: pmaxsw %xmm2, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: pshufb %xmm2, %xmm3
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v32i16_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpminsw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpminsw %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpminsw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpmaxsw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmaxsw %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v32i16_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpminsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpminsw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
; Canonical PACKUS pattern: smin(%a0, 255), then smax(result, 0), then trunc.
  %1 = icmp slt <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = icmp sgt <32 x i16> %2, zeroinitializer
  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
  %5 = trunc <32 x i16> %4 to <32 x i8>
  ret <32 x i8> %5
}