; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL

;
; vXi64
;

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a0)
  ret i64 %1
}

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> %a0)
  ret i64 %1
}

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm5, %xmm3
; SSE-NEXT: paddq %xmm1, %xmm3
; SSE-NEXT: paddq %xmm4, %xmm2
; SSE-NEXT: paddq %xmm3, %xmm2
; SSE-NEXT: paddq %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;

define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> %a0)
  ret i32 %1
}

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0)
  ret i32 %1
}

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-LABEL: test_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %a0)
  ret i32 %1
}

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-LABEL: test_v32i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm7, %xmm3
; SSE-NEXT: paddd %xmm5, %xmm3
; SSE-NEXT: paddd %xmm1, %xmm3
; SSE-NEXT: paddd %xmm4, %xmm2
; SSE-NEXT: paddd %xmm3, %xmm2
; SSE-NEXT: paddd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;

define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> %a0)
  ret i16 %1
}

define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> %a0)
  ret i16 %1
}

define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %a0)
  ret i16 %1
}

define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %a0)
  ret i16 %1
}

define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm6, %xmm2
; SSE-NEXT: paddw %xmm7, %xmm3
; SSE-NEXT: paddw %xmm5, %xmm3
; SSE-NEXT: paddw %xmm1, %xmm3
; SSE-NEXT: paddw %xmm4, %xmm2
; SSE-NEXT: paddw %xmm3, %xmm2
; SSE-NEXT: paddw %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;

define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> %a0)
  ret i8 %1
}

define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> %a0)
  ret i8 %1
}

849define i8 @test_v8i8(<8 x i8> %a0) {
850; SSE2-LABEL: test_v8i8:
851; SSE2: # %bb.0:
852; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
853; SSE2-NEXT: paddb %xmm0, %xmm1
854; SSE2-NEXT: movdqa %xmm1, %xmm0
855; SSE2-NEXT: psrld $16, %xmm0
856; SSE2-NEXT: paddb %xmm1, %xmm0
857; SSE2-NEXT: movdqa %xmm0, %xmm1
858; SSE2-NEXT: psrlw $8, %xmm1
859; SSE2-NEXT: paddb %xmm0, %xmm1
860; SSE2-NEXT: movd %xmm1, %eax
861; SSE2-NEXT: # kill: def $al killed $al killed $eax
862; SSE2-NEXT: retq
863;
864; SSE41-LABEL: test_v8i8:
865; SSE41: # %bb.0:
866; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
867; SSE41-NEXT: paddb %xmm0, %xmm1
868; SSE41-NEXT: movdqa %xmm1, %xmm0
869; SSE41-NEXT: psrld $16, %xmm0
870; SSE41-NEXT: paddb %xmm1, %xmm0
871; SSE41-NEXT: movdqa %xmm0, %xmm1
872; SSE41-NEXT: psrlw $8, %xmm1
873; SSE41-NEXT: paddb %xmm0, %xmm1
874; SSE41-NEXT: pextrb $0, %xmm1, %eax
875; SSE41-NEXT: # kill: def $al killed $al killed $eax
876; SSE41-NEXT: retq
877;
878; AVX-LABEL: test_v8i8:
879; AVX: # %bb.0:
880; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
881; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
882; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
883; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
884; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
885; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
886; AVX-NEXT: vpextrb $0, %xmm0, %eax
887; AVX-NEXT: # kill: def $al killed $al killed $eax
888; AVX-NEXT: retq
889;
890; AVX512-LABEL: test_v8i8:
891; AVX512: # %bb.0:
892; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
893; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
894; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
895; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
896; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
897; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
898; AVX512-NEXT: vpextrb $0, %xmm0, %eax
899; AVX512-NEXT: # kill: def $al killed $al killed $eax
900; AVX512-NEXT: retq
901 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> %a0)
902 ret i8 %1
903}
904
905define i8 @test_v16i8(<16 x i8> %a0) {
906; SSE2-LABEL: test_v16i8:
907; SSE2: # %bb.0:
908; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
909; SSE2-NEXT: paddb %xmm0, %xmm1
910; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
911; SSE2-NEXT: paddb %xmm1, %xmm0
912; SSE2-NEXT: movdqa %xmm0, %xmm1
913; SSE2-NEXT: psrld $16, %xmm1
914; SSE2-NEXT: paddb %xmm0, %xmm1
915; SSE2-NEXT: movdqa %xmm1, %xmm0
916; SSE2-NEXT: psrlw $8, %xmm0
917; SSE2-NEXT: paddb %xmm1, %xmm0
918; SSE2-NEXT: movd %xmm0, %eax
919; SSE2-NEXT: # kill: def $al killed $al killed $eax
920; SSE2-NEXT: retq
921;
922; SSE41-LABEL: test_v16i8:
923; SSE41: # %bb.0:
924; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
925; SSE41-NEXT: paddb %xmm0, %xmm1
926; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
927; SSE41-NEXT: paddb %xmm1, %xmm0
928; SSE41-NEXT: movdqa %xmm0, %xmm1
929; SSE41-NEXT: psrld $16, %xmm1
930; SSE41-NEXT: paddb %xmm0, %xmm1
931; SSE41-NEXT: movdqa %xmm1, %xmm0
932; SSE41-NEXT: psrlw $8, %xmm0
933; SSE41-NEXT: paddb %xmm1, %xmm0
934; SSE41-NEXT: pextrb $0, %xmm0, %eax
935; SSE41-NEXT: # kill: def $al killed $al killed $eax
936; SSE41-NEXT: retq
937;
938; AVX-LABEL: test_v16i8:
939; AVX: # %bb.0:
940; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
941; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
942; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
943; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
944; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
945; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
946; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
947; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
948; AVX-NEXT: vpextrb $0, %xmm0, %eax
949; AVX-NEXT: # kill: def $al killed $al killed $eax
950; AVX-NEXT: retq
951;
952; AVX512-LABEL: test_v16i8:
953; AVX512: # %bb.0:
954; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
955; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
956; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
957; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
958; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
959; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
960; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
961; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
962; AVX512-NEXT: vpextrb $0, %xmm0, %eax
963; AVX512-NEXT: # kill: def $al killed $al killed $eax
964; AVX512-NEXT: retq
965 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %a0)
966 ret i8 %1
967}
968
969define i8 @test_v32i8(<32 x i8> %a0) {
970; SSE2-LABEL: test_v32i8:
971; SSE2: # %bb.0:
972; SSE2-NEXT: paddb %xmm1, %xmm0
973; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
974; SSE2-NEXT: paddb %xmm0, %xmm1
975; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
976; SSE2-NEXT: paddb %xmm1, %xmm0
977; SSE2-NEXT: movdqa %xmm0, %xmm1
978; SSE2-NEXT: psrld $16, %xmm1
979; SSE2-NEXT: paddb %xmm0, %xmm1
980; SSE2-NEXT: movdqa %xmm1, %xmm0
981; SSE2-NEXT: psrlw $8, %xmm0
982; SSE2-NEXT: paddb %xmm1, %xmm0
983; SSE2-NEXT: movd %xmm0, %eax
984; SSE2-NEXT: # kill: def $al killed $al killed $eax
985; SSE2-NEXT: retq
986;
987; SSE41-LABEL: test_v32i8:
988; SSE41: # %bb.0:
989; SSE41-NEXT: paddb %xmm1, %xmm0
990; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
991; SSE41-NEXT: paddb %xmm0, %xmm1
992; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
993; SSE41-NEXT: paddb %xmm1, %xmm0
994; SSE41-NEXT: movdqa %xmm0, %xmm1
995; SSE41-NEXT: psrld $16, %xmm1
996; SSE41-NEXT: paddb %xmm0, %xmm1
997; SSE41-NEXT: movdqa %xmm1, %xmm0
998; SSE41-NEXT: psrlw $8, %xmm0
999; SSE41-NEXT: paddb %xmm1, %xmm0
1000; SSE41-NEXT: pextrb $0, %xmm0, %eax
1001; SSE41-NEXT: # kill: def $al killed $al killed $eax
1002; SSE41-NEXT: retq
1003;
1004; AVX1-LABEL: test_v32i8:
1005; AVX1: # %bb.0:
1006; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1007; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1008; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1009; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1010; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1011; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1012; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1013; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1014; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1015; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1016; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1017; AVX1-NEXT: # kill: def $al killed $al killed $eax
1018; AVX1-NEXT: vzeroupper
1019; AVX1-NEXT: retq
1020;
1021; AVX2-LABEL: test_v32i8:
1022; AVX2: # %bb.0:
1023; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001024; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001025; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001026; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001027; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001028; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001029; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001030; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001031; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1032; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1033; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1034; AVX2-NEXT: # kill: def $al killed $al killed $eax
1035; AVX2-NEXT: vzeroupper
1036; AVX2-NEXT: retq
1037;
1038; AVX512-LABEL: test_v32i8:
1039; AVX512: # %bb.0:
1040; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001041; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001042; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001043; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001044; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001045; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001046; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001047; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001048; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1049; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1050; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1051; AVX512-NEXT: # kill: def $al killed $al killed $eax
1052; AVX512-NEXT: vzeroupper
1053; AVX512-NEXT: retq
1054 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %a0)
1055 ret i8 %1
1056}
1057
1058define i8 @test_v64i8(<64 x i8> %a0) {
1059; SSE2-LABEL: test_v64i8:
1060; SSE2: # %bb.0:
1061; SSE2-NEXT: paddb %xmm3, %xmm1
1062; SSE2-NEXT: paddb %xmm2, %xmm1
1063; SSE2-NEXT: paddb %xmm0, %xmm1
1064; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1065; SSE2-NEXT: paddb %xmm1, %xmm0
1066; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1067; SSE2-NEXT: paddb %xmm0, %xmm1
1068; SSE2-NEXT: movdqa %xmm1, %xmm0
1069; SSE2-NEXT: psrld $16, %xmm0
1070; SSE2-NEXT: paddb %xmm1, %xmm0
1071; SSE2-NEXT: movdqa %xmm0, %xmm1
1072; SSE2-NEXT: psrlw $8, %xmm1
1073; SSE2-NEXT: paddb %xmm0, %xmm1
1074; SSE2-NEXT: movd %xmm1, %eax
1075; SSE2-NEXT: # kill: def $al killed $al killed $eax
1076; SSE2-NEXT: retq
1077;
1078; SSE41-LABEL: test_v64i8:
1079; SSE41: # %bb.0:
1080; SSE41-NEXT: paddb %xmm3, %xmm1
1081; SSE41-NEXT: paddb %xmm2, %xmm1
1082; SSE41-NEXT: paddb %xmm0, %xmm1
1083; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1084; SSE41-NEXT: paddb %xmm1, %xmm0
1085; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1086; SSE41-NEXT: paddb %xmm0, %xmm1
1087; SSE41-NEXT: movdqa %xmm1, %xmm0
1088; SSE41-NEXT: psrld $16, %xmm0
1089; SSE41-NEXT: paddb %xmm1, %xmm0
1090; SSE41-NEXT: movdqa %xmm0, %xmm1
1091; SSE41-NEXT: psrlw $8, %xmm1
1092; SSE41-NEXT: paddb %xmm0, %xmm1
1093; SSE41-NEXT: pextrb $0, %xmm1, %eax
1094; SSE41-NEXT: # kill: def $al killed $al killed $eax
1095; SSE41-NEXT: retq
1096;
1097; AVX1-LABEL: test_v64i8:
1098; AVX1: # %bb.0:
1099; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1100; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1101; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
1102; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
1103; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1104; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1105; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1106; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1107; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1108; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1109; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1110; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1111; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1112; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1113; AVX1-NEXT: # kill: def $al killed $al killed $eax
1114; AVX1-NEXT: vzeroupper
1115; AVX1-NEXT: retq
1116;
1117; AVX2-LABEL: test_v64i8:
1118; AVX2: # %bb.0:
1119; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1120; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001121; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001122; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001123; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001124; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001125; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001126; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001127; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001128; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1129; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1130; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1131; AVX2-NEXT: # kill: def $al killed $al killed $eax
1132; AVX2-NEXT: vzeroupper
1133; AVX2-NEXT: retq
1134;
1135; AVX512-LABEL: test_v64i8:
1136; AVX512: # %bb.0:
1137; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1138; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1139; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
Sanjay Patela61d5862019-01-29 19:13:39 +00001140; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001141; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
Sanjay Patela61d5862019-01-29 19:13:39 +00001142; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001143; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
Sanjay Patela61d5862019-01-29 19:13:39 +00001144; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001145; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
Sanjay Patela61d5862019-01-29 19:13:39 +00001146; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001147; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1148; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1149; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1150; AVX512-NEXT: # kill: def $al killed $al killed $eax
1151; AVX512-NEXT: vzeroupper
1152; AVX512-NEXT: retq
1153 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> %a0)
1154 ret i8 %1
1155}
1156
1157define i8 @test_v128i8(<128 x i8> %a0) {
1158; SSE2-LABEL: test_v128i8:
1159; SSE2: # %bb.0:
1160; SSE2-NEXT: paddb %xmm6, %xmm2
1161; SSE2-NEXT: paddb %xmm7, %xmm3
1162; SSE2-NEXT: paddb %xmm5, %xmm3
1163; SSE2-NEXT: paddb %xmm1, %xmm3
1164; SSE2-NEXT: paddb %xmm4, %xmm2
1165; SSE2-NEXT: paddb %xmm3, %xmm2
1166; SSE2-NEXT: paddb %xmm0, %xmm2
1167; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
1168; SSE2-NEXT: paddb %xmm2, %xmm0
1169; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1170; SSE2-NEXT: paddb %xmm0, %xmm1
1171; SSE2-NEXT: movdqa %xmm1, %xmm0
1172; SSE2-NEXT: psrld $16, %xmm0
1173; SSE2-NEXT: paddb %xmm1, %xmm0
1174; SSE2-NEXT: movdqa %xmm0, %xmm1
1175; SSE2-NEXT: psrlw $8, %xmm1
1176; SSE2-NEXT: paddb %xmm0, %xmm1
1177; SSE2-NEXT: movd %xmm1, %eax
1178; SSE2-NEXT: # kill: def $al killed $al killed $eax
1179; SSE2-NEXT: retq
1180;
1181; SSE41-LABEL: test_v128i8:
1182; SSE41: # %bb.0:
1183; SSE41-NEXT: paddb %xmm6, %xmm2
1184; SSE41-NEXT: paddb %xmm7, %xmm3
1185; SSE41-NEXT: paddb %xmm5, %xmm3
1186; SSE41-NEXT: paddb %xmm1, %xmm3
1187; SSE41-NEXT: paddb %xmm4, %xmm2
1188; SSE41-NEXT: paddb %xmm3, %xmm2
1189; SSE41-NEXT: paddb %xmm0, %xmm2
1190; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
1191; SSE41-NEXT: paddb %xmm2, %xmm0
1192; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1193; SSE41-NEXT: paddb %xmm0, %xmm1
1194; SSE41-NEXT: movdqa %xmm1, %xmm0
1195; SSE41-NEXT: psrld $16, %xmm0
1196; SSE41-NEXT: paddb %xmm1, %xmm0
1197; SSE41-NEXT: movdqa %xmm0, %xmm1
1198; SSE41-NEXT: psrlw $8, %xmm1
1199; SSE41-NEXT: paddb %xmm0, %xmm1
1200; SSE41-NEXT: pextrb $0, %xmm1, %eax
1201; SSE41-NEXT: # kill: def $al killed $al killed $eax
1202; SSE41-NEXT: retq
1203;
1204; AVX1-LABEL: test_v128i8:
1205; AVX1: # %bb.0:
1206; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm4
1207; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1208; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1209; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
1210; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1211; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
1212; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1213; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
1214; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
1215; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
1216; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1217; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1218; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1219; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1220; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1221; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1222; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1223; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1224; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1225; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1226; AVX1-NEXT: # kill: def $al killed $al killed $eax
1227; AVX1-NEXT: vzeroupper
1228; AVX1-NEXT: retq
1229;
1230; AVX2-LABEL: test_v128i8:
1231; AVX2: # %bb.0:
1232; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
1233; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
1234; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1235; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001236; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001237; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001238; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001239; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001240; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001241; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
Sanjay Patel21aa6dd2019-01-25 15:37:42 +00001242; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001243; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1244; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1245; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1246; AVX2-NEXT: # kill: def $al killed $al killed $eax
1247; AVX2-NEXT: vzeroupper
1248; AVX2-NEXT: retq
1249;
1250; AVX512-LABEL: test_v128i8:
1251; AVX512: # %bb.0:
1252; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1253; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1254; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1255; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
Sanjay Patela61d5862019-01-29 19:13:39 +00001256; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001257; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
Sanjay Patela61d5862019-01-29 19:13:39 +00001258; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001259; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
Sanjay Patela61d5862019-01-29 19:13:39 +00001260; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001261; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
Sanjay Patela61d5862019-01-29 19:13:39 +00001262; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
Craig Topper39910892018-12-05 06:29:44 +00001263; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1264; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1265; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1266; AVX512-NEXT: # kill: def $al killed $al killed $eax
1267; AVX512-NEXT: vzeroupper
1268; AVX512-NEXT: retq
1269 %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> %a0)
1270 ret i8 %1
1271}
1272
1273declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
1274declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
1275declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
1276declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)
1277
1278declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)
1279declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
1280declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
1281declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
1282declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)
1283
1284declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>)
1285declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
1286declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
1287declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
1288declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
1289declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)
1290
1291declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>)
1292declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>)
1293declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
1294declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
1295declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
1296declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
1297declare i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8>)