; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.

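; test14_undef: a[0]+a[1] feeds result lane 0 and b[2]+b[3] feeds lane 3, so
; both sums land in the low 128 bits and one phaddd of the low halves suffices.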
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test14_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test14_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; Verify that we generate integer horizontal adds instead of two scalar adds
; followed by vector inserts.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-SLOW-LABEL: test15_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movd %xmm0, %eax
; SSE-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-SLOW-NEXT: movd %xmm0, %ecx
; SSE-SLOW-NEXT: addl %eax, %ecx
; SSE-SLOW-NEXT: movd %xmm3, %eax
; SSE-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE-SLOW-NEXT: movd %xmm0, %edx
; SSE-SLOW-NEXT: addl %eax, %edx
; SSE-SLOW-NEXT: movd %ecx, %xmm0
; SSE-SLOW-NEXT: movd %edx, %xmm1
; SSE-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test15_undef:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE-FAST-NEXT: phaddd %xmm3, %xmm3
; SSE-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
; SSE-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: test15_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vpextrd $1, %xmm0, %ecx
; AVX1-SLOW-NEXT: addl %eax, %ecx
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vpextrd $1, %xmm0, %edx
; AVX1-SLOW-NEXT: addl %eax, %edx
; AVX1-SLOW-NEXT: vmovd %ecx, %xmm0
; AVX1-SLOW-NEXT: vmovd %edx, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test15_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test15_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test15_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}

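; PR40243_alt: both scalar sums target the high 128-bit half of the result
; (lanes 4 and 7), so AVX1 must extract the high halves before a 128-bit
; vphaddd, while AVX2/AVX512 can use a single 256-bit vphaddd.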
define <8 x i32> @PR40243_alt(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: PR40243_alt:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: PR40243_alt:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR40243_alt:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR40243_alt:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %a4 = extractelement <8 x i32> %a, i32 4
  %a5 = extractelement <8 x i32> %a, i32 5
  %add4 = add i32 %a4, %a5
  %b6 = extractelement <8 x i32> %b, i32 6
  %b7 = extractelement <8 x i32> %b, i32 7
  %add7 = add i32 %b6, %b7
  %r4 = insertelement <8 x i32> undef, i32 %add4, i32 4
  %r = insertelement <8 x i32> %r4, i32 %add7, i32 7
  ret <8 x i32> %r
}

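; test16_undef: both sums come from the low half of %a and land in lanes 0
; and 1, so a single horizontal add is enough.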
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test16_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test16_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}

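; Same pattern as test16_undef, widened to <16 x i32>: only the low 128 bits
; of the result are defined, so no 512-bit operation should be needed.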
define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test16_v16i32_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test16_v16i32_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16_v16i32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test16_v16i32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <16 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <16 x i32> %vecinit, i32 %add4, i32 1
  ret <16 x i32> %vecinit5
}

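; test17_undef: all four adjacent pairs of %a are summed into lanes 0-3, i.e.
; a full horizontal add of the two 128-bit halves packed into the low xmm.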
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test17_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test17_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}

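; Same as test17_undef, widened to <16 x i32>. Note that the AVX512 codegen
; below uses a 256-bit vphaddd whose source upper half is undef.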
define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test17_v16i32_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test17_v16i32_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17_v16i32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test17_v16i32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <16 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <16 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <16 x i32> %a, i32 4
  %vecext5 = extractelement <16 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <16 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <16 x i32> %a, i32 6
  %vecext7 = extractelement <16 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <16 x i32> %vecinit3, i32 %add4, i32 3
  ret <16 x i32> %vecinit4
}
