; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512

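; Each test below expresses a horizontal signed-max reduction as log2(N)
; shufflevector/icmp sgt/select stages and checks the expansion at every
; SSE/AVX feature level. As a rough sketch, the v2i64 case could equally be
; written via the reduction intrinsic (intrinsic name assumed here; it has
; varied across LLVM versions):
;   %r = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> %a0)
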
;
; 128-bit Vectors
;

define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_reduce_v2i64:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
; X86-SSE2-NEXT: pxor %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X86-SSE2-NEXT: pand %xmm5, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; X86-SSE2-NEXT: por %xmm2, %xmm3
; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm3
; X86-SSE2-NEXT: por %xmm0, %xmm3
; X86-SSE2-NEXT: movd %xmm3, %eax
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %edx
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v2i64:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X86-SSE42-NEXT: movd %xmm2, %eax
; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v2i64:
; X86-AVX: ## BB#0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v2i64:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
; X64-SSE2-NEXT: pxor %xmm1, %xmm2
; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm5, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; X64-SSE2-NEXT: por %xmm2, %xmm3
; X64-SSE2-NEXT: pand %xmm3, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm3
; X64-SSE2-NEXT: por %xmm0, %xmm3
; X64-SSE2-NEXT: movq %xmm3, %rax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v2i64:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X64-SSE42-NEXT: movq %xmm2, %rax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v2i64:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v2i64:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v2i64:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: retq
  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %2 = icmp sgt <2 x i64> %a0, %1
  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
  %4 = extractelement <2 x i64> %3, i32 0
  ret i64 %4
}
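
; Pre-SSE4.2 there is no 64-bit signed compare (pcmpgtq is SSE4.2), so the
; SSE2 expansion above synthesizes one from dword ops: pxor flips the sign
; bits of the low dwords (letting signed pcmpgtd act as an unsigned compare
; there), and the final predicate is high-gt OR (high-eq AND low-gt), built
; with pcmpgtd/pcmpeqd/pshufd before the pand/pandn/por select. AVX512VL
; lowers the whole reduction step to a single vpmaxsq.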

define i32 @test_reduce_v4i32(<4 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v4i32:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v4i32:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v4i32:
; X86-AVX: ## BB#0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v4i32:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v4i32:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v4i32:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vmovd %xmm0, %eax
; X64-AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %2 = icmp sgt <4 x i32> %a0, %1
  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <4 x i32> %3, %4
  %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
  %7 = extractelement <4 x i32> %6, i32 0
  ret i32 %7
}
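
; pmaxsd only arrived with SSE4.1, so the SSE2 path above builds each max
; stage as pcmpgtd plus a pand/pandn/por select, while SSE4.2/AVX collapse
; every stage to a single pmaxsd/vpmaxsd.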

define i16 @test_reduce_v8i16(<8 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v8i16:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v8i16:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v8i16:
; X86-AVX: ## BB#0:
; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v8i16:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v8i16:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v8i16:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vmovd %xmm0, %eax
; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX-NEXT: retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <8 x i16> %a0, %1
  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <8 x i16> %3, %4
  %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
  %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <8 x i16> %6, %7
  %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
  %10 = extractelement <8 x i16> %9, i32 0
  ret i16 %10
}
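
; phminposuw (SSE4.1) computes the unsigned horizontal minimum of eight i16
; lanes. XORing every lane with 32767 (0x7FFF) reverses the signed order in
; unsigned space (32767 ^ 0x7FFF = 0, -32768 ^ 0x7FFF = 0xFFFF), so the
; xor/phminposuw/xor sandwich above yields the signed maximum in one step.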

define i8 @test_reduce_v16i8(<16 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i8:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
; X86-SSE2-NEXT: psrld $16, %xmm0
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
; X86-SSE2-NEXT: psrlw $8, %xmm0
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## BB#0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v16i8:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
; X64-SSE2-NEXT: psrld $16, %xmm0
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
; X64-SSE2-NEXT: psrlw $8, %xmm0
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <16 x i8> %a0, %1
  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <16 x i8> %3, %4
  %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
  %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <16 x i8> %6, %7
  %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
  %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <16 x i8> %9, %10
  %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
  %13 = extractelement <16 x i8> %12, i32 0
  ret i8 %13
}
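
; pmaxsb is likewise SSE4.1-only, so SSE2 repeats its pcmpgtb select ladder
; for every stage. The last two stages lower their single-element shuffles as
; psrld $16 / psrlw $8, which move the wanted bytes into lane 0 more cheaply
; than a byte shuffle; there is no phminposuw-style shortcut for i8.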

;
; 256-bit Vectors
;

define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE2-LABEL: test_reduce_v4i64:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
; X86-SSE2-NEXT: pxor %xmm2, %xmm4
; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; X86-SSE2-NEXT: pand %xmm6, %xmm3
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; X86-SSE2-NEXT: por %xmm3, %xmm4
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: pxor %xmm0, %xmm2
; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; X86-SSE2-NEXT: pand %xmm5, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm4
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm4, %xmm2
; X86-SSE2-NEXT: movd %xmm2, %eax
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %edx
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v4i64:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X86-SSE42-NEXT: movd %xmm2, %eax
; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v4i64:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v4i64:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v4i64:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
; X64-SSE2-NEXT: pxor %xmm2, %xmm4
; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm6, %xmm3
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; X64-SSE2-NEXT: por %xmm3, %xmm4
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: pxor %xmm0, %xmm2
; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm5, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm4
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm4, %xmm2
; X64-SSE2-NEXT: movq %xmm2, %rax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v4i64:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X64-SSE42-NEXT: movq %xmm2, %rax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v4i64:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v4i64:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v4i64:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %2 = icmp sgt <4 x i64> %a0, %1
  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <4 x i64> %3, %4
  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
  %7 = extractelement <4 x i64> %6, i32 0
  ret i64 %7
}
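
; AVX1 has no 256-bit integer compares, so each vpcmpgtq above runs per
; 128-bit half (vextractf128/vinsertf128) before a ymm vblendvpd select;
; AVX2 compares the full ymm register, and AVX512VL again reduces each stage
; to vpmaxsq.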

define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v8i32:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movd %xmm2, %eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v8i32:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v8i32:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v8i32:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v8i32:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movd %xmm2, %eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v8i32:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v8i32:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v8i32:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v8i32:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <8 x i32> %a0, %1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
  %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <8 x i32> %3, %4
  %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
  %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <8 x i32> %6, %7
  %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}
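
; Note that the AVX2/AVX512 paths above keep using ymm vpmaxsd even once only
; the low xmm lanes matter, whereas AVX1 narrows to xmm immediately after the
; first extract.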

define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v16i16:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i16:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v16i16:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v16i16:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v16i16:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i16:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v16i16:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v16i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <16 x i16> %a0, %1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <16 x i16> %3, %4
  %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
  %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <16 x i16> %6, %7
  %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
  %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <16 x i16> %9, %10
  %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
  %13 = extractelement <16 x i16> %12, i32 0
  ret i16 %13
}
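
; The 256-bit i16 case first folds the two halves together with
; pmaxsw/vpmaxsw and then reuses the xor/phminposuw/xor trick from
; test_reduce_v8i16 for the remaining 128-bit reduction.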

define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i8:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
; X86-SSE2-NEXT: psrld $16, %xmm0
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
; X86-SSE2-NEXT: psrlw $8, %xmm0
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movd %xmm2, %eax
; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v32i8:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v32i8:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
; X64-SSE2-NEXT: psrld $16, %xmm0
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
; X64-SSE2-NEXT: psrlw $8, %xmm0
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movd %xmm2, %eax
; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v32i8:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <32 x i8> %a0, %1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <32 x i8> %3, %4
  %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
  %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <32 x i8> %6, %7
  %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
  %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <32 x i8> %9, %10
  %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
  %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %14 = icmp sgt <32 x i8> %12, %13
  %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
  %16 = extractelement <32 x i8> %15, i32 0
  ret i8 %16
}
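
; v32i8 follows the same shape: the halves are folded with (v)pmaxsb (or the
; pcmpgtb select ladder on SSE2) and the byte ladder from test_reduce_v16i8
; finishes off the low 128 bits.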
1079
1080;
1081; 512-bit Vectors
1082;
1083
1084define i64 @test_reduce_v8i64(<8 x i64> %a0) {
1085; X86-SSE2-LABEL: test_reduce_v8i64:
1086; X86-SSE2: ## BB#0:
1087; X86-SSE2-NEXT: subl $28, %esp
1088; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
1089; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
1090; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
1091; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
1092; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1093; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
1094; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
1095; X86-SSE2-NEXT: pxor %xmm4, %xmm5
1096; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
1097; X86-SSE2-NEXT: pxor %xmm4, %xmm6
1098; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
1099; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
1100; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
1101; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
1102; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1103; X86-SSE2-NEXT: pand %xmm5, %xmm6
1104; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1105; X86-SSE2-NEXT: por %xmm6, %xmm5
1106; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
1107; X86-SSE2-NEXT: pxor %xmm4, %xmm6
1108; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
1109; X86-SSE2-NEXT: pxor %xmm4, %xmm7
1110; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
1111; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
1112; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
1113; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1114; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
1115; X86-SSE2-NEXT: pand %xmm6, %xmm7
1116; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
1117; X86-SSE2-NEXT: por %xmm7, %xmm6
1118; X86-SSE2-NEXT: pand %xmm6, %xmm1
1119; X86-SSE2-NEXT: pandn %xmm3, %xmm6
1120; X86-SSE2-NEXT: por %xmm1, %xmm6
1121; X86-SSE2-NEXT: pand %xmm5, %xmm2
1122; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
1123; X86-SSE2-NEXT: por %xmm2, %xmm5
1124; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
1125; X86-SSE2-NEXT: pxor %xmm4, %xmm0
1126; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
1127; X86-SSE2-NEXT: pxor %xmm4, %xmm1
1128; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1129; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
1130; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
1131; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
1132; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1133; X86-SSE2-NEXT: pand %xmm0, %xmm1
1134; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
1135; X86-SSE2-NEXT: por %xmm1, %xmm0
1136; X86-SSE2-NEXT: pand %xmm0, %xmm6
1137; X86-SSE2-NEXT: pandn %xmm5, %xmm0
1138; X86-SSE2-NEXT: por %xmm6, %xmm0
1139; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1140; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
1141; X86-SSE2-NEXT: pxor %xmm4, %xmm2
1142; X86-SSE2-NEXT: pxor %xmm1, %xmm4
1143; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
1144; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
1145; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
1146; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
1147; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1148; X86-SSE2-NEXT: pand %xmm2, %xmm4
1149; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
1150; X86-SSE2-NEXT: por %xmm4, %xmm2
1151; X86-SSE2-NEXT: pand %xmm2, %xmm0
1152; X86-SSE2-NEXT: pandn %xmm1, %xmm2
1153; X86-SSE2-NEXT: por %xmm0, %xmm2
1154; X86-SSE2-NEXT: movd %xmm2, %eax
1155; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
1156; X86-SSE2-NEXT: movd %xmm0, %edx
1157; X86-SSE2-NEXT: addl $28, %esp
1158; X86-SSE2-NEXT: retl
1159;
1160; X86-SSE42-LABEL: test_reduce_v8i64:
1161; X86-SSE42: ## BB#0:
1162; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
1163; X86-SSE42-NEXT: movdqa %xmm4, %xmm5
1164; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
1165; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
1166; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
1167; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
1168; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
1169; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
1170; X86-SSE42-NEXT: movapd %xmm2, %xmm0
1171; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
1172; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
1173; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
1174; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
1175; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
1176; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
1177; X86-SSE42-NEXT: movd %xmm1, %eax
; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v8i64:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v8i64:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v8i64:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
; X64-SSE2-NEXT: pxor %xmm4, %xmm5
; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
; X64-SSE2-NEXT: pxor %xmm4, %xmm6
; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm8, %xmm6
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
; X64-SSE2-NEXT: por %xmm6, %xmm8
; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
; X64-SSE2-NEXT: pxor %xmm4, %xmm6
; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
; X64-SSE2-NEXT: pxor %xmm4, %xmm7
; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm9, %xmm7
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
; X64-SSE2-NEXT: por %xmm7, %xmm6
; X64-SSE2-NEXT: pand %xmm6, %xmm0
; X64-SSE2-NEXT: pandn %xmm2, %xmm6
; X64-SSE2-NEXT: por %xmm0, %xmm6
; X64-SSE2-NEXT: pand %xmm8, %xmm1
; X64-SSE2-NEXT: pandn %xmm3, %xmm8
; X64-SSE2-NEXT: por %xmm1, %xmm8
; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
; X64-SSE2-NEXT: pxor %xmm4, %xmm0
; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
; X64-SSE2-NEXT: pxor %xmm4, %xmm1
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm3, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; X64-SSE2-NEXT: por %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm6
; X64-SSE2-NEXT: pandn %xmm8, %xmm1
; X64-SSE2-NEXT: por %xmm6, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm2
; X64-SSE2-NEXT: pxor %xmm0, %xmm4
; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; X64-SSE2-NEXT: pand %xmm5, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; X64-SSE2-NEXT: por %xmm2, %xmm3
; X64-SSE2-NEXT: pand %xmm3, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm3
; X64-SSE2-NEXT: por %xmm1, %xmm3
; X64-SSE2-NEXT: movq %xmm3, %rax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v8i64:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
; X64-SSE42-NEXT: movdqa %xmm4, %xmm5
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; X64-SSE42-NEXT: movapd %xmm2, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; X64-SSE42-NEXT: movq %xmm1, %rax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v8i64:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v8i64:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v8i64:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <8 x i64> %a0, %1
  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <8 x i64> %3, %4
  %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
  %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <8 x i64> %6, %7
  %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
  %10 = extractelement <8 x i64> %9, i32 0
  ret i64 %10
}

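; Signed-max reduction of <16 x i32> down to i32: the IR below repeatedly
; halves the vector with shufflevector and keeps the larger elements via
; icmp sgt + select. SSE4.1+ targets lower each step to a single pmaxsd,
; while plain SSE2 has to synthesize the max from pcmpgtd/pand/pandn/por.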
define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i32:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; X86-SSE2-NEXT: pand %xmm5, %xmm1
; X86-SSE2-NEXT: pandn %xmm3, %xmm5
; X86-SSE2-NEXT: por %xmm1, %xmm5
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm2, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
; X86-SSE2-NEXT: pand %xmm0, %xmm4
; X86-SSE2-NEXT: pandn %xmm5, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v16i32:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i32:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v16i32:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v16i32:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; X64-SSE2-NEXT: pand %xmm5, %xmm1
; X64-SSE2-NEXT: pandn %xmm3, %xmm5
; X64-SSE2-NEXT: por %xmm1, %xmm5
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm2, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
; X64-SSE2-NEXT: pand %xmm0, %xmm4
; X64-SSE2-NEXT: pandn %xmm5, %xmm0
; X64-SSE2-NEXT: por %xmm4, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v16i32:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i32:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v16i32:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v16i32:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <16 x i32> %a0, %1
  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
  %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <16 x i32> %3, %4
  %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
  %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <16 x i32> %6, %7
  %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
  %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <16 x i32> %9, %10
  %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
  %13 = extractelement <16 x i32> %12, i32 0
  ret i32 %13
}

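; Signed-max reduction of <32 x i16> down to i16. SSE2 has a native pmaxsw;
; SSE4.1+ targets instead flip the sign ordering by XORing with 32767 so the
; unsigned horizontal minimum phminposuw can finish the reduction in one
; instruction, then flip the result back.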
define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i16:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1
; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $16, %xmm1
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v32i16:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsw %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxsw %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v32i16:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v32i16:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v32i16:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1
; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrld $16, %xmm1
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v32i16:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsw %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxsw %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v32i16:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v32i16:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v32i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <32 x i16> %a0, %1
  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <32 x i16> %3, %4
  %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
  %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <32 x i16> %6, %7
  %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
  %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <32 x i16> %9, %10
  %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
  %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %14 = icmp sgt <32 x i16> %12, %13
  %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
  %16 = extractelement <32 x i16> %15, i32 0
  ret i16 %16
}

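; Signed-max reduction of <64 x i8> down to i8. phminposuw only handles
; word elements, so even SSE4.1+ targets fall back to pmaxsb plus shuffles
; and element shifts to funnel the maximum down to byte 0.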
define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v64i8:
; X86-SSE2: ## BB#0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
; X86-SSE2-NEXT: pand %xmm5, %xmm1
; X86-SSE2-NEXT: pandn %xmm3, %xmm5
; X86-SSE2-NEXT: por %xmm1, %xmm5
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm2, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
; X86-SSE2-NEXT: pand %xmm0, %xmm4
; X86-SSE2-NEXT: pandn %xmm5, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
; X86-SSE2-NEXT: psrld $16, %xmm0
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
; X86-SSE2-NEXT: psrlw $8, %xmm0
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm2, %xmm1
; X86-SSE2-NEXT: movd %xmm1, %eax
; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: test_reduce_v64i8:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v64i8:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_reduce_v64i8:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v64i8:
; X64-SSE2: ## BB#0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
; X64-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
; X64-SSE2-NEXT: pand %xmm5, %xmm1
; X64-SSE2-NEXT: pandn %xmm3, %xmm5
; X64-SSE2-NEXT: por %xmm1, %xmm5
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm2, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
; X64-SSE2-NEXT: pand %xmm0, %xmm4
; X64-SSE2-NEXT: pandn %xmm5, %xmm0
; X64-SSE2-NEXT: por %xmm4, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
; X64-SSE2-NEXT: psrld $16, %xmm0
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: pandn %xmm0, %xmm2
; X64-SSE2-NEXT: por %xmm1, %xmm2
; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
; X64-SSE2-NEXT: psrlw $8, %xmm0
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm2, %xmm1
; X64-SSE2-NEXT: movd %xmm1, %eax
; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: test_reduce_v64i8:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v64i8:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_reduce_v64i8:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
  %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = icmp sgt <64 x i8> %a0, %1
  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = icmp sgt <64 x i8> %3, %4
  %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
  %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = icmp sgt <64 x i8> %6, %7
  %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
  %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <64 x i8> %9, %10
  %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
  %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %14 = icmp sgt <64 x i8> %12, %13
  %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
  %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %17 = icmp sgt <64 x i8> %15, %16
  %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
  %19 = extractelement <64 x i8> %18, i32 0
  ret i8 %19
}