; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL

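; Tests lowering of the llvm.experimental.vector.reduce.add.* intrinsics
; (declared at the end of this file) for i64/i32/i16/i8 element types at
; each of the SSE2, SSE4.1, AVX, AVX2 and AVX512 feature levels above.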
;
; vXi64
;
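; Per the checks below, the reduction is expanded by repeatedly adding the
; high half of the vector to the low half (subvector extracts, then a pshufd
; of the upper i64), with movq reading back the final scalar.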

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a0)
  ret i64 %1
}

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> %a0)
  ret i64 %1
}

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm5, %xmm3
; SSE-NEXT: paddq %xmm1, %xmm3
; SSE-NEXT: paddq %xmm4, %xmm2
; SSE-NEXT: paddq %xmm3, %xmm2
; SSE-NEXT: paddq %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;
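; i32 reductions follow the same halve-and-add pattern with paddd; per the
; checks below, several SSE4.1/AVX configurations combine the last two
; elements with a horizontal phaddd instead of a shuffle.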

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: phaddd %xmm1, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0)
  ret i32 %1
}

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-LABEL: test_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: phaddd %xmm1, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-LABEL: test_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i32:
; SSE41: # %bb.0:
; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: paddd %xmm2, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: phaddd %xmm0, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %a0)
  ret i32 %1
}

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-LABEL: test_v32i32:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm6, %xmm2
; SSE2-NEXT: paddd %xmm7, %xmm3
; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i32:
; SSE41: # %bb.0:
; SSE41-NEXT: paddd %xmm6, %xmm2
; SSE41-NEXT: paddd %xmm7, %xmm3
; SSE41-NEXT: paddd %xmm5, %xmm3
; SSE41-NEXT: paddd %xmm1, %xmm3
; SSE41-NEXT: paddd %xmm4, %xmm2
; SSE41-NEXT: paddd %xmm3, %xmm2
; SSE41-NEXT: paddd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: phaddd %xmm0, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;
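; Once an i16 reduction is down to two elements they share a 32-bit lane, so
; the checks below end with a psrld $16 + paddw step (or a horizontal phaddw
; where available) before the movd extract.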

define i16 @test_v8i16(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: phaddw %xmm0, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %a0)
  ret i16 %1
}

define i16 @test_v16i16(<16 x i16> %a0) {
; SSE2-LABEL: test_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i16:
; SSE41: # %bb.0:
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: phaddw %xmm0, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vphaddw %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %a0)
  ret i16 %1
}

define i16 @test_v32i16(<32 x i16> %a0) {
; SSE2-LABEL: test_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: paddw %xmm3, %xmm1
; SSE2-NEXT: paddw %xmm2, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i16:
; SSE41: # %bb.0:
; SSE41-NEXT: paddw %xmm3, %xmm1
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: phaddw %xmm1, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE2-LABEL: test_v64i16:
; SSE2: # %bb.0:
; SSE2-NEXT: paddw %xmm6, %xmm2
; SSE2-NEXT: paddw %xmm7, %xmm3
; SSE2-NEXT: paddw %xmm5, %xmm3
; SSE2-NEXT: paddw %xmm1, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm2
; SSE2-NEXT: paddw %xmm3, %xmm2
; SSE2-NEXT: paddw %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v64i16:
; SSE41: # %bb.0:
; SSE41-NEXT: paddw %xmm6, %xmm2
; SSE41-NEXT: paddw %xmm7, %xmm3
; SSE41-NEXT: paddw %xmm5, %xmm3
; SSE41-NEXT: paddw %xmm1, %xmm3
; SSE41-NEXT: paddw %xmm4, %xmm2
; SSE41-NEXT: paddw %xmm3, %xmm2
; SSE41-NEXT: paddw %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: paddw %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: phaddw %xmm1, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;
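; i8 reductions need two final in-lane steps per the checks below: psrld $16
; + paddb, then psrlw $8 + paddb, with the scalar extracted via movd or
; pextrb.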

define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %a0)
  ret i8 %1
}

define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %a0)
  ret i8 %1
}

define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-LABEL: test_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: paddb %xmm3, %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v64i8:
; SSE41: # %bb.0:
; SSE41-NEXT: paddb %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> %a0)
  ret i8 %1
}

define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-LABEL: test_v128i8:
; SSE2: # %bb.0:
; SSE2-NEXT: paddb %xmm6, %xmm2
; SSE2-NEXT: paddb %xmm7, %xmm3
; SSE2-NEXT: paddb %xmm5, %xmm3
; SSE2-NEXT: paddb %xmm1, %xmm3
; SSE2-NEXT: paddb %xmm4, %xmm2
; SSE2-NEXT: paddb %xmm3, %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v128i8:
; SSE41: # %bb.0:
; SSE41-NEXT: paddb %xmm6, %xmm2
; SSE41-NEXT: paddb %xmm7, %xmm3
; SSE41-NEXT: paddb %xmm5, %xmm3
; SSE41-NEXT: paddb %xmm1, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm2
; SSE41-NEXT: paddb %xmm3, %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: paddb %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v128i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> %a0)
  ret i8 %1
}

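; Declarations of the experimental reduction intrinsics exercised above.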
declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)

declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)

declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)

declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
declare i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8>)