; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST

; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
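; test14_undef: a[0]+a[1] is inserted into lane 0 and b[2]+b[3] into lane 3 of the
; result; all other lanes are undef, so a single phaddd of the low halves of %a and
; %b produces both sums.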
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test14_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; On AVX2, the following sequence can be folded into a single horizontal add.
; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
; integer horizontal adds instead of two scalar adds followed by vector inserts.
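; test15_undef: a[0]+a[1] goes into lane 0 and b[4]+b[5] into lane 6, so the two
; sums land in different 128-bit halves of the result.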
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test15_undef:
; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm0, %ecx
; SSE-NEXT: addl %eax, %ecx
; SSE-NEXT: movd %xmm3, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: addl %eax, %edx
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test15_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; AVX1-NEXT: addl %eax, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrd $1, %xmm0, %edx
; AVX1-NEXT: addl %eax, %edx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vmovd %edx, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}

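; test16_undef: both sums (a[0]+a[1] and a[2]+a[3]) come from the low 128 bits of %a
; and go into lanes 0 and 1, so a phaddd of %a with itself is enough.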
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test16_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}

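; test17_undef: all four pairwise sums of %a land in lanes 0-3, so this becomes a
; phaddd of the two 128-bit halves of %a.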
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test17_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}