; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.
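; The pattern being folded is, schematically (an illustrative sketch only, not one
; of the checked functions below; the value names are hypothetical):
;   %x0 = extractelement <8 x i32> %v, i32 0
;   %x1 = extractelement <8 x i32> %v, i32 1
;   %s  = add i32 %x0, %x1
;   %r  = insertelement <8 x i32> undef, i32 %s, i32 0
; i.e. two extracts of adjacent elements, a scalar add, and an insert into a
; mostly-undef vector. The backend is expected to select each such chain as one
; lane of (v)phaddd and leave the undef lanes unconstrained, as the checks below show.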

define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test14_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test14_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; On AVX2, the following sequence can be folded into a single horizontal add.
; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
; integer horizontal adds instead of two scalar adds followed by vector inserts.
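; Lane sketch for the function below (assuming the standard x86-64 vector calling
; convention: %a in ymm0 / xmm0-xmm1 and %b in ymm1 / xmm2-xmm3): the two defined
; result lanes are lane 0 (%a[0] + %a[1], low 128-bit half) and lane 6
; (%b[4] + %b[5], high 128-bit half). Without AVX2 there is no 256-bit vphaddd, so
; no single horizontal add can populate both halves of the ymm result, and the
; SSE/AVX1 checks below fall back to scalar adds plus inserts.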
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test15_undef:
; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm0, %ecx
; SSE-NEXT: addl %eax, %ecx
; SSE-NEXT: movd %xmm3, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: addl %eax, %edx
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test15_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; AVX1-NEXT: addl %eax, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrd $1, %xmm0, %edx
; AVX1-NEXT: addl %eax, %edx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vmovd %edx, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test15_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}

define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test16_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test16_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}

define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test16_v16i32_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test16_v16i32_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16_v16i32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test16_v16i32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <16 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <16 x i32> %vecinit, i32 %add4, i32 1
  ret <16 x i32> %vecinit5
}

define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test17_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test17_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}

define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test17_v16i32_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test17_v16i32_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17_v16i32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test17_v16i32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <16 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <16 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <16 x i32> %a, i32 4
  %vecext5 = extractelement <16 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <16 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <16 x i32> %a, i32 6
  %vecext7 = extractelement <16 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <16 x i32> %vecinit3, i32 %add4, i32 3
  ret <16 x i32> %vecinit4
}