; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
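;
; These tests cover lowering of the llvm.experimental.vector.reduce.and.*
; intrinsics with -x86-experimental-vector-widening-legalization enabled.
; Every target reduces the same way: AND together multi-register inputs,
; extract and AND the high subvector until a single 128-bit register remains,
; then shuffle/shift and AND to halve the vector until one element is left to
; move into a GPR.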

;
; vXi64
;
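; The i64 reductions need no in-element shifts: once a single XMM register is
; left, the upper i64 is replicated with a shuffle, ANDed in, and the low
; element is extracted with movq.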

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0)
  ret i64 %1
}

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0)
  ret i64 %1
}

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
; SSE-NEXT: pand %xmm7, %xmm3
; SSE-NEXT: pand %xmm5, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;
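; Under widening legalization the sub-128-bit <2 x i32> input is treated as a
; wider XMM vector, so it reduces with the same pshufd/pand step as v4i32,
; ending in a movd of the low element.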

define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0)
  ret i32 %1
}

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0)
  ret i32 %1
}

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-LABEL: test_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0)
  ret i32 %1
}

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-LABEL: test_v32i32:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
; SSE-NEXT: pand %xmm7, %xmm3
; SSE-NEXT: pand %xmm5, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;
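; The i16 reductions shuffle down to a single 32-bit lane and finish with a
; psrld $16 to AND the last two elements together; the value is returned in
; ax via the sub-register extract marked by the "kill" comment.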

define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0)
  ret i16 %1
}

define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0)
  ret i16 %1
}

define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0)
  ret i16 %1
}

define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0)
  ret i16 %1
}

define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
; SSE-NEXT: pand %xmm7, %xmm3
; SSE-NEXT: pand %xmm5, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;
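; The i8 reductions add a final psrlw $8 step to fold the last two bytes
; together; SSE2 extracts the result with movd, while SSE4.1 and AVX use
; pextrb $0.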

define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0)
  ret i8 %1
}

define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0)
  ret i8 %1
}

define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0)
  ret i8 %1
}

define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0)
  ret i8 %1
}

define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0)
  ret i8 %1
}

define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-LABEL: test_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v64i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0)
  ret i8 %1
}

define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-LABEL: test_v128i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v128i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand %xmm6, %xmm2
; SSE41-NEXT: pand %xmm7, %xmm3
; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: pand %xmm1, %xmm3
; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v128i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0)
  ret i8 %1
}

declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>)
declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>)

declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>)

declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>)

declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>)