; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512

;
; 128-bit Vectors
;

define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_reduce_v2i64:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
; X86-SSE2-NEXT: pxor %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X86-SSE2-NEXT: pand %xmm5, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; X86-SSE2-NEXT: por %xmm2, %xmm3
; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm3
; X86-SSE2-NEXT: por %xmm0, %xmm3
; X86-SSE2-NEXT: movd %xmm3, %eax
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %edx
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v2i64:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X86-SSE42-NEXT: movd %xmm2, %eax
; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v2i64:
; X86-AVX: ## BB#0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v2i64:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
; X64-SSE2-NEXT: pxor %xmm1, %xmm2
; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm5, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; X64-SSE2-NEXT: por %xmm2, %xmm3
; X64-SSE2-NEXT: pand %xmm3, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm3
; X64-SSE2-NEXT: por %xmm0, %xmm3
; X64-SSE2-NEXT: movq %xmm3, %rax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v2i64:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X64-SSE42-NEXT: movq %xmm2, %rax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v2i64:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v2i64:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v2i64:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: retq
  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %2 = icmp sgt <2 x i64> %a0, %1
  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
  %4 = extractelement <2 x i64> %3, i32 0
  ret i64 %4
}

define i32 @test_reduce_v4i32(<4 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v4i32:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v4i32:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v4i32:
; X86-AVX: ## BB#0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v4i32:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v4i32:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v4i32:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vmovd %xmm0, %eax
; X64-AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %2 = icmp sgt <4 x i32> %a0, %1
  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <4 x i32> %3, %4
  %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
  %7 = extractelement <4 x i32> %6, i32 0
  ret i32 %7
}

define i16 @test_reduce_v8i16(<8 x i16> %a0) {
; X86-SSE-LABEL: test_reduce_v8i16:
; X86-SSE: ## BB#0:
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psrld $16, %xmm1
; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE-NEXT: movd %xmm1, %eax
; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v8i16:
; X86-AVX: ## BB#0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_reduce_v8i16:
; X64-SSE: ## BB#0:
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psrld $16, %xmm1
; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE-NEXT: movd %xmm1, %eax
; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v8i16:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vmovd %xmm0, %eax
; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX-NEXT: retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <8 x i16> %a0, %1
  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <8 x i16> %3, %4
  %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
  %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <8 x i16> %6, %7
  %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
  %10 = extractelement <8 x i16> %9, i32 0
  ret i16 %10
}

define i8 @test_reduce_v16i8(<16 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i8:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
; X86-SSE2-NEXT: psrld $16, %xmm0
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
; X86-SSE2-NEXT: psrlw $8, %xmm0
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## BB#0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v16i8:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
; X64-SSE2-NEXT: psrld $16, %xmm0
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
; X64-SSE2-NEXT: psrlw $8, %xmm0
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <16 x i8> %a0, %1
  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <16 x i8> %3, %4
  %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
  %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <16 x i8> %6, %7
  %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
  %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <16 x i8> %9, %10
  %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
  %13 = extractelement <16 x i8> %12, i32 0
  ret i8 %13
}

;
; 256-bit Vectors
;

define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE2-LABEL: test_reduce_v4i64:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
; X86-SSE2-NEXT: pxor %xmm2, %xmm4
; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; X86-SSE2-NEXT: pand %xmm6, %xmm3
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; X86-SSE2-NEXT: por %xmm3, %xmm4
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: pxor %xmm0, %xmm2
; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; X86-SSE2-NEXT: pand %xmm5, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm4
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm4, %xmm2
; X86-SSE2-NEXT: movd %xmm2, %eax
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %edx
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v4i64:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X86-SSE42-NEXT: movd %xmm2, %eax
; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v4i64:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v4i64:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v4i64:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
; X64-SSE2-NEXT: pxor %xmm2, %xmm4
; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm6, %xmm3
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; X64-SSE2-NEXT: por %xmm3, %xmm4
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: pxor %xmm0, %xmm2
; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm5, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm4
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm4, %xmm2
; X64-SSE2-NEXT: movq %xmm2, %rax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v4i64:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X64-SSE42-NEXT: movq %xmm2, %rax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v4i64:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v4i64:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v4i64:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %2 = icmp sgt <4 x i64> %a0, %1
  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <4 x i64> %3, %4
  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
  %7 = extractelement <4 x i64> %6, i32 0
  ret i64 %7
}

define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v8i32:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movd %xmm2, %eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v8i32:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v8i32:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v8i32:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v8i32:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movd %xmm2, %eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v8i32:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v8i32:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v8i32:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v8i32:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <8 x i32> %a0, %1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
  %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <8 x i32> %3, %4
  %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
  %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <8 x i32> %6, %7
  %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}

define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X86-SSE-LABEL: test_reduce_v16i16:
; X86-SSE: ## BB#0:
; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psrld $16, %xmm1
; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE-NEXT: movd %xmm1, %eax
; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i16:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v16i16:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: test_reduce_v16i16:
; X64-SSE: ## BB#0:
; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psrld $16, %xmm1
; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE-NEXT: movd %xmm1, %eax
; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i16:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v16i16:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v16i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <16 x i16> %a0, %1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <16 x i16> %3, %4
  %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
  %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <16 x i16> %6, %7
  %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
  %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <16 x i16> %9, %10
  %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
  %13 = extractelement <16 x i16> %12, i32 0
  ret i16 %13
}

define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i8:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
; X86-SSE2-NEXT: psrld $16, %xmm0
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
; X86-SSE2-NEXT: psrlw $8, %xmm0
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movd %xmm2, %eax
; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v32i8:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v32i8:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
; X64-SSE2-NEXT: psrld $16, %xmm0
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
; X64-SSE2-NEXT: psrlw $8, %xmm0
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movd %xmm2, %eax
; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v32i8:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <32 x i8> %a0, %1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <32 x i8> %3, %4
  %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
  %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <32 x i8> %6, %7
  %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
  %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <32 x i8> %9, %10
  %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
  %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %14 = icmp sgt <32 x i8> %12, %13
  %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
  %16 = extractelement <32 x i8> %15, i32 0
  ret i8 %16
}

;
; 512-bit Vectors
;

1056define i64 @test_reduce_v8i64(<8 x i64> %a0) {
1057; X86-SSE2-LABEL: test_reduce_v8i64:
1058; X86-SSE2: ## BB#0:
1059; X86-SSE2-NEXT: subl $28, %esp
1060; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
1061; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
1062; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
1063; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
1064; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1065; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
1066; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
1067; X86-SSE2-NEXT: pxor %xmm4, %xmm5
1068; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
1069; X86-SSE2-NEXT: pxor %xmm4, %xmm6
1070; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
1071; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
1072; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
1073; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
1074; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1075; X86-SSE2-NEXT: pand %xmm5, %xmm6
1076; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1077; X86-SSE2-NEXT: por %xmm6, %xmm5
1078; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
1079; X86-SSE2-NEXT: pxor %xmm4, %xmm6
1080; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
1081; X86-SSE2-NEXT: pxor %xmm4, %xmm7
1082; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
1083; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
1084; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
1085; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1086; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
1087; X86-SSE2-NEXT: pand %xmm6, %xmm7
1088; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
1089; X86-SSE2-NEXT: por %xmm7, %xmm6
1090; X86-SSE2-NEXT: pand %xmm6, %xmm1
1091; X86-SSE2-NEXT: pandn %xmm3, %xmm6
1092; X86-SSE2-NEXT: por %xmm1, %xmm6
1093; X86-SSE2-NEXT: pand %xmm5, %xmm2
1094; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
1095; X86-SSE2-NEXT: por %xmm2, %xmm5
1096; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
1097; X86-SSE2-NEXT: pxor %xmm4, %xmm0
1098; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
1099; X86-SSE2-NEXT: pxor %xmm4, %xmm1
1100; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1101; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
1102; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
1103; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
1104; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1105; X86-SSE2-NEXT: pand %xmm0, %xmm1
1106; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
1107; X86-SSE2-NEXT: por %xmm1, %xmm0
1108; X86-SSE2-NEXT: pand %xmm0, %xmm6
1109; X86-SSE2-NEXT: pandn %xmm5, %xmm0
1110; X86-SSE2-NEXT: por %xmm6, %xmm0
1111; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1112; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
1113; X86-SSE2-NEXT: pxor %xmm4, %xmm2
1114; X86-SSE2-NEXT: pxor %xmm1, %xmm4
1115; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
1116; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
1117; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
1118; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
1119; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1120; X86-SSE2-NEXT: pand %xmm2, %xmm4
1121; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
1122; X86-SSE2-NEXT: por %xmm4, %xmm2
1123; X86-SSE2-NEXT: pand %xmm2, %xmm0
1124; X86-SSE2-NEXT: pandn %xmm1, %xmm2
1125; X86-SSE2-NEXT: por %xmm0, %xmm2
1126; X86-SSE2-NEXT: movd %xmm2, %eax
1127; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
1128; X86-SSE2-NEXT: movd %xmm0, %edx
1129; X86-SSE2-NEXT: addl $28, %esp
1130; X86-SSE2-NEXT: retl
1131;
1132; X86-SSE42-LABEL: test_reduce_v8i64:
1133; X86-SSE42: ## BB#0:
1134; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
1135; X86-SSE42-NEXT: movdqa %xmm4, %xmm5
1136; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
1137; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
1138; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
1139; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
1140; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
1141; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
1142; X86-SSE42-NEXT: movapd %xmm2, %xmm0
1143; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
1144; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
1145; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
1146; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
1147; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
1148; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
1149; X86-SSE42-NEXT: movd %xmm1, %eax
1150; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
1151; X86-SSE42-NEXT: retl
1152;
1153; X86-AVX1-LABEL: test_reduce_v8i64:
1154; X86-AVX1: ## BB#0:
1155; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1156; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1157; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
1158; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
1159; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1160; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
1161; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1162; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
1163; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
1164; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1165; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
1166; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
1167; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
1168; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1169; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
1170; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1171; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
1172; X86-AVX1-NEXT: vmovd %xmm0, %eax
1173; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
1174; X86-AVX1-NEXT: vzeroupper
1175; X86-AVX1-NEXT: retl
1176;
1177; X86-AVX2-LABEL: test_reduce_v8i64:
1178; X86-AVX2: ## BB#0:
1179; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
1180; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
1181; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
1182; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
1183; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
1184; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
1185; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
1186; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
1187; X86-AVX2-NEXT: vmovd %xmm0, %eax
1188; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
1189; X86-AVX2-NEXT: vzeroupper
1190; X86-AVX2-NEXT: retl
1191;
1192; X64-SSE2-LABEL: test_reduce_v8i64:
1193; X64-SSE2: ## BB#0:
1194; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
1195; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
1196; X64-SSE2-NEXT: pxor %xmm4, %xmm5
1197; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
1198; X64-SSE2-NEXT: pxor %xmm4, %xmm6
1199; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
1200; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
1201; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
1202; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm8, %xmm6
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
; X64-SSE2-NEXT: por %xmm6, %xmm8
; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
; X64-SSE2-NEXT: pxor %xmm4, %xmm6
; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
; X64-SSE2-NEXT: pxor %xmm4, %xmm7
; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm9, %xmm7
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
; X64-SSE2-NEXT: por %xmm7, %xmm6
; X64-SSE2-NEXT: pand %xmm6, %xmm0
; X64-SSE2-NEXT: pandn %xmm2, %xmm6
; X64-SSE2-NEXT: por %xmm0, %xmm6
; X64-SSE2-NEXT: pand %xmm8, %xmm1
; X64-SSE2-NEXT: pandn %xmm3, %xmm8
; X64-SSE2-NEXT: por %xmm1, %xmm8
; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
; X64-SSE2-NEXT: pxor %xmm4, %xmm0
; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
; X64-SSE2-NEXT: pxor %xmm4, %xmm1
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm3, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; X64-SSE2-NEXT: por %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm6
; X64-SSE2-NEXT: pandn %xmm8, %xmm1
; X64-SSE2-NEXT: por %xmm6, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm2
; X64-SSE2-NEXT: pxor %xmm0, %xmm4
; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm5, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; X64-SSE2-NEXT: por %xmm2, %xmm3
; X64-SSE2-NEXT: pand %xmm3, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm3
; X64-SSE2-NEXT: por %xmm1, %xmm3
; X64-SSE2-NEXT: movq %xmm3, %rax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v8i64:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
; X64-SSE42-NEXT: movdqa %xmm4, %xmm5
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; X64-SSE42-NEXT: movapd %xmm2, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; X64-SSE42-NEXT: movq %xmm1, %rax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v8i64:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v8i64:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v8i64:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <8 x i64> %a0, %1
  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <8 x i64> %3, %4
  %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
  %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <8 x i64> %6, %7
  %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
  %10 = extractelement <8 x i64> %9, i32 0
  ret i64 %10
}
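
; The SSE2 sequences above are long because pre-SSE4.2 x86 has no 64-bit
; signed compare instruction: each i64 greater-than is synthesized from
; 32-bit compares. My reading of the generated code, as a sketch (the
; sketch itself is not checked by this test):
;   lo_gt = pcmpgtd(lo(a) ^ 0x80000000, lo(b) ^ 0x80000000)  ; unsigned low-dword compare via sign-bit flip
;   hi_gt = pcmpgtd(hi(a), hi(b))                            ; signed high-dword compare
;   hi_eq = pcmpeqd(hi(a), hi(b))
;   (a >s b) = hi_gt | (hi_eq & lo_gt)
; The larger value is then selected with the pand/pandn/por blend idiom.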
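
; Reduce <16 x i32> to its signed-maximum element. The IR below builds the
; usual log2 tree: shuffle the upper half down, take the lane-wise signed
; max, and halve again until one lane remains. With SSE4.1 and later each
; step is a single pmaxsd; plain SSE2 open-codes the max as pcmpgtd plus
; the pand/pandn/por blend. Newer LLVM releases can express the whole
; reduction as one intrinsic call; a hypothetical equivalent, not used by
; this test:
;   %r = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %a0)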
define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i32:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; X86-SSE2-NEXT: pand %xmm5, %xmm1
; X86-SSE2-NEXT: pandn %xmm3, %xmm5
; X86-SSE2-NEXT: por %xmm1, %xmm5
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm2, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
; X86-SSE2-NEXT: pand %xmm0, %xmm4
; X86-SSE2-NEXT: pandn %xmm5, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v16i32:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i32:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v16i32:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v16i32:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; X64-SSE2-NEXT: pand %xmm5, %xmm1
; X64-SSE2-NEXT: pandn %xmm3, %xmm5
; X64-SSE2-NEXT: por %xmm1, %xmm5
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm2, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
; X64-SSE2-NEXT: pand %xmm0, %xmm4
; X64-SSE2-NEXT: pandn %xmm5, %xmm0
; X64-SSE2-NEXT: por %xmm4, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v16i32:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i32:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v16i32:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v16i32:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <16 x i32> %a0, %1
  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
  %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <16 x i32> %3, %4
  %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
  %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <16 x i32> %6, %7
  %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
  %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <16 x i32> %9, %10
  %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
  %13 = extractelement <16 x i32> %12, i32 0
  ret i32 %13
}
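
; The same log2 tree for <32 x i16>. Unlike the i32 and i64 cases, a
; native signed 16-bit max (pmaxsw) has existed since SSE2, so the SSE2
; and SSE4.2 runs share a single set of checks below. The final step uses
; psrld $16 to move the remaining odd halfword down to lane 0, since
; pshufd can only shuffle 32-bit lanes.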
define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X86-SSE-LABEL: test_reduce_v32i16:
; X86-SSE: ## BB#0:
; X86-SSE-NEXT: pmaxsw %xmm3, %xmm1
; X86-SSE-NEXT: pmaxsw %xmm2, %xmm0
; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psrld $16, %xmm1
; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE-NEXT: movd %xmm1, %eax
; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v32i16:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v32i16:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: test_reduce_v32i16:
; X64-SSE: ## BB#0:
; X64-SSE-NEXT: pmaxsw %xmm3, %xmm1
; X64-SSE-NEXT: pmaxsw %xmm2, %xmm0
; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psrld $16, %xmm1
; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE-NEXT: movd %xmm1, %eax
; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v32i16:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v32i16:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v32i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <32 x i16> %a0, %1
  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <32 x i16> %3, %4
  %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
  %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <32 x i16> %6, %7
  %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
  %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <32 x i16> %9, %10
  %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
  %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %14 = icmp sgt <32 x i16> %12, %13
  %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
  %16 = extractelement <32 x i16> %15, i32 0
  ret i16 %16
}
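
; The same log2 tree for <64 x i8>. pmaxsb only arrived with SSE4.1, so
; the SSE2 runs fall back to pcmpgtb plus the pand/pandn/por blend for
; every step. Once the reduction narrows below a dword the halves are
; brought together with bit shifts rather than shuffles: psrld $16 for
; the last four bytes and psrlw $8 for the final pair, because pshufd
; cannot address 16-bit or 8-bit lanes.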
define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v64i8:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
; X86-SSE2-NEXT: pand %xmm5, %xmm1
; X86-SSE2-NEXT: pandn %xmm3, %xmm5
; X86-SSE2-NEXT: por %xmm1, %xmm5
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm2, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
; X86-SSE2-NEXT: pand %xmm0, %xmm4
; X86-SSE2-NEXT: pandn %xmm5, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
; X86-SSE2-NEXT: psrld $16, %xmm0
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
; X86-SSE2-NEXT: psrlw $8, %xmm0
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v64i8:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v64i8:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v64i8:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v64i8:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
; X64-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
; X64-SSE2-NEXT: pand %xmm5, %xmm1
; X64-SSE2-NEXT: pandn %xmm3, %xmm5
; X64-SSE2-NEXT: por %xmm1, %xmm5
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm2, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
; X64-SSE2-NEXT: pand %xmm0, %xmm4
; X64-SSE2-NEXT: pandn %xmm5, %xmm0
; X64-SSE2-NEXT: por %xmm4, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
; X64-SSE2-NEXT: psrld $16, %xmm0
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
; X64-SSE2-NEXT: psrlw $8, %xmm0
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v64i8:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v64i8:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v64i8:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <64 x i8> %a0, %1
  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <64 x i8> %3, %4
  %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
  %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <64 x i8> %6, %7
  %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
  %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <64 x i8> %9, %10
  %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
  %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %14 = icmp sgt <64 x i8> %12, %13
  %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
  %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %17 = icmp sgt <64 x i8> %15, %16
  %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
  %19 = extractelement <64 x i8> %18, i32 0
  ret i8 %19
}