; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
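; Each subtarget is run both with the default tuning and with +fast-hops, giving
; the -SLOW and -FAST check-prefix variants.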

; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.

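; test14_undef: the IR puts a[0]+a[1] in lane 0 and b[2]+b[3] in lane 3 of the
; result; both sums use the low 128-bit halves of the inputs, so one
; phaddd/vphaddd covers them and the remaining lanes stay undef.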
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test14_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test14_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; On AVX2, the following sequence can be folded into a single horizontal add.
; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
; integer horizontal adds instead of two scalar adds followed by vector inserts.
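; Here a[0]+a[1] goes to lane 0 and b[4]+b[5] to lane 6, so the two sums land in
; different 128-bit halves; the checks below show AVX2/AVX-512 emitting a single
; ymm vphaddd while SSE and AVX1 fall back to scalar extracts, adds, and inserts.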
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test15_undef:
; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm0, %ecx
; SSE-NEXT: addl %eax, %ecx
; SSE-NEXT: movd %xmm3, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: addl %eax, %edx
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test15_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; AVX1-NEXT: addl %eax, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrd $1, %xmm0, %edx
; AVX1-NEXT: addl %eax, %edx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vmovd %edx, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test15_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}

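; test16_undef: a[0]+a[1] and a[2]+a[3] fill lanes 0 and 1, a horizontal add of
; just the low half of %a, so an xmm phaddd is sufficient on SSE/AVX1 (the
; AVX2/AVX-512 runs use the ymm form).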
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test16_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test16_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}

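; test17_undef: all four pairwise sums of %a (a[0]+a[1] through a[6]+a[7]) fill
; lanes 0-3, so the two 128-bit halves of %a are horizontally added into one xmm
; value and the upper result lanes stay undef.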
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test17_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test17_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}
