; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.

define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test14_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test14_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test14_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; Verify that pairs of scalar adds followed by vector inserts are folded into
; integer horizontal adds instead of two scalar adds followed by vector inserts.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test15_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    movd %xmm0, %ecx
; SSE-NEXT:    addl %eax, %ecx
; SSE-NEXT:    movd %xmm3, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    addl %eax, %edx
; SSE-NEXT:    movd %ecx, %xmm0
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test15_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX1-NEXT:    addl %eax, %ecx
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vpextrd $1, %xmm0, %edx
; AVX1-NEXT:    addl %eax, %edx
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    vmovd %edx, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test15_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test15_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}
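; Pairwise adds that read only the high 128-bit halves of %a and %b, with the
; sums inserted into the high half of the result (a variation on PR40243, per
; the function name).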
define <8 x i32> @PR40243_alt(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: PR40243_alt:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR40243_alt:
; AVX1:       # %bb.0:
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR40243_alt:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR40243_alt:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a4 = extractelement <8 x i32> %a, i32 4
  %a5 = extractelement <8 x i32> %a, i32 5
  %add4 = add i32 %a4, %a5
  %b6 = extractelement <8 x i32> %b, i32 6
  %b7 = extractelement <8 x i32> %b, i32 7
  %add7 = add i32 %b6, %b7
  %r4 = insertelement <8 x i32> undef, i32 %add4, i32 4
  %r = insertelement <8 x i32> %r4, i32 %add7, i32 7
  ret <8 x i32> %r
}
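; Only lanes 0 and 1 of the result are defined, so a 128-bit horizontal add of
; %a's low half is sufficient.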
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}
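; Same pattern as test16_undef, widened to v16i32.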
define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test16_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test16_v16i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test16_v16i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test16_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <16 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <16 x i32> %vecinit, i32 %add4, i32 1
  ret <16 x i32> %vecinit5
}
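; All four pairwise sums of %a's elements land in the low half of the result.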
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}
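; Same pattern as test17_undef, widened to v16i32.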
define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test17_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_v16i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_v16i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <16 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <16 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <16 x i32> %a, i32 4
  %vecext5 = extractelement <16 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <16 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <16 x i32> %a, i32 6
  %vecext7 = extractelement <16 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <16 x i32> %vecinit3, i32 %add4, i32 3
  ret <16 x i32> %vecinit4
}