; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
;
; Verify that the DAG combiner correctly folds bitwise operations across
; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
; basic and always-safe patterns. Also test that the DAG combiner will combine
; target-specific shuffle instructions where reasonable.
target triple = "x86_64-unknown-unknown"

; SSE2 shuffle intrinsics exercised by the combine_pshuf* tests below.
declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
; Two pshufd ops with the same self-inverting mask (27 = <3,2,1,0>) cancel;
; the whole function folds to just retq.
define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd1:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
  ret <4 x i32> %c
}
; pshufd / pshuflw (identity mask -28 = 0xE4) / pshufd chain is a no-op
; overall; it must fold away completely.
define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd2:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}
; Same as combine_pshufd2 but with an identity pshufhw in the middle;
; also folds away completely.
define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd3:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}
; The two pshufd ops cancel, leaving only a single pshufhw.
define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd4:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd4:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
  ret <4 x i32> %d
}
; The two pshufd ops cancel, leaving only a single pshuflw.
define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd5:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd5:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
  ret <4 x i32> %d
}
; pshufd with mask 0 splats lane 0, so the second pshufd (mask 8 = <0,2,0,0>)
; is redundant and the pair folds to a single splat.
define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd6:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd6:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
  ret <4 x i32> %c
}
; Two pshuflw ops with the same self-inverting mask (27) cancel out.
define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw1:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  ret <8 x i16> %c
}
; pshuflw / identity pshufhw (-28) / pshuflw chain folds away completely.
define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw2:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}
; The two pshuflw ops cancel, leaving only a single pshufhw.
define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; SSE-LABEL: combine_pshuflw3:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshuflw3:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}
; The two pshufhw ops cancel, leaving only a single pshuflw.
define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; SSE-LABEL: combine_pshufhw1:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufhw1:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}
; Identical swizzles on both AND operands: the AND is done first and a
; single shuffle remains.
define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1:
; SSE: # BB#0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test1:
; AVX: # BB#0:
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
; Identical swizzles on both OR operands: the OR is done first and a
; single shuffle remains.
define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2:
; SSE: # BB#0:
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test2:
; AVX: # BB#0:
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
; Identical swizzles on both XOR operands: the XOR is done first and a
; single shuffle remains.
define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
; Same as test1 but the swizzled operands come from the second shuffle input.
define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4:
; SSE: # BB#0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test4:
; AVX: # BB#0:
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
; Same as test2 but the swizzled operands come from the second shuffle input.
define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5:
; SSE: # BB#0:
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test5:
; AVX: # BB#0:
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
; Same as test3 but the swizzled operands come from the second shuffle input.
define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test6:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
; Verify that DAGCombiner moves the shuffle after the xor/and/or even if the
; shuffles are not performing a swizzle operation.
; Two-input (non-swizzle) shuffles on both AND operands: the AND still moves
; before the shuffle; SSE4.1+/AVX select the blend form.
define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2: # BB#0:
; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3: # BB#0:
; SSSE3-NEXT: andps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
; SSE41: # BB#0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test1b:
; AVX1: # BB#0:
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test1b:
; AVX2: # BB#0:
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
; Same as test1b with OR instead of AND.
define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2: # BB#0:
; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3: # BB#0:
; SSSE3-NEXT: orps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
; SSE41: # BB#0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test2b:
; AVX1: # BB#0:
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test2b:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
; Same as test1b with XOR; x ^ x zeroes the %c lanes, hence the extra
; self-xor producing a zero register before the shuffle/blend.
define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3b:
; SSE2: # BB#0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3b:
; SSSE3: # BB#0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test3b:
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test3b:
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test3b:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
; Same as test1b but with %c as the first shuffle input.
define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2: # BB#0:
; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3: # BB#0:
; SSSE3-NEXT: andps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
; SSE41: # BB#0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test4b:
; AVX1: # BB#0:
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test4b:
; AVX2: # BB#0:
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
; Same as test4b with OR instead of AND.
define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2: # BB#0:
; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3: # BB#0:
; SSSE3-NEXT: orps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
; SSE41: # BB#0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test5b:
; AVX1: # BB#0:
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test5b:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
; Same as test4b with XOR; the %c lanes cancel to zero (see test3b).
define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6b:
; SSE2: # BB#0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6b:
; SSSE3: # BB#0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test6b:
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test6b:
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test6b:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
; Cross-lane two-input shuffles on both AND operands: logic op first,
; single shufps remains.
define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1c:
; SSE: # BB#0:
; SSE-NEXT: andps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test1c:
; AVX: # BB#0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
; Same as test1c with OR instead of AND.
define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2c:
; SSE: # BB#0:
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test2c:
; AVX: # BB#0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
; Same as test1c with XOR; the %c lanes become zero (self-xor'ed register).
define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3c:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm1, %xmm0
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
; Same as test1c but with %c as the first shuffle input.
define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4c:
; SSE: # BB#0:
; SSE-NEXT: andps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test4c:
; AVX: # BB#0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
; Same as test4c with OR instead of AND.
define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5c:
; SSE: # BB#0:
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test5c:
; AVX: # BB#0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
; Same as test4c with XOR; the %c lanes become zero (self-xor'ed register).
define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6c:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm1, %xmm0
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test6c:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
; Nested shuffles where the second mask only reads %A lanes of the first:
; the pair folds to a single pshufd.
define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test1:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test1:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,0,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single pshufd.
define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test2:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test2:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single pshufd.
define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test3:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test3:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single pshufd.
define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test4:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test4:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single movhlps.
define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test5:
; SSE: # BB#0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test5:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single pshufd.
define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test6:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test6:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single pshufd.
define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test7:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test7:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single pshufd.
define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test8:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test8:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single pshufd.
define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test9:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,2]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test9:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,0,2]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single pshufd.
define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test10:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test10:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}
; Nested shuffle pair folding to a single pshufd.
define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test11:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test11:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
  ret <4 x i32> %2
}
; The combined mask is a splat of %A's element 0: SSE/AVX1 use a single
; pshufd [0,0,0,0], while AVX2 can use a register broadcast instead.
796define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
797; SSE-LABEL: combine_nested_undef_test12:
798; SSE: # BB#0:
799; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
800; SSE-NEXT: retq
801;
802; AVX1-LABEL: combine_nested_undef_test12:
803; AVX1: # BB#0:
804; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
805; AVX1-NEXT: retq
806;
807; AVX2-LABEL: combine_nested_undef_test12:
808; AVX2: # BB#0:
809; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
810; AVX2-NEXT: retq
811 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
812 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
813 ret <4 x i32> %2
814}
815
816; The following pair of shuffles is folded into vector %A.
; (Every defined lane of the outer mask maps back to %A in its original
; position, so no instruction at all is needed - just retq.)
817define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
818; ALL-LABEL: combine_nested_undef_test13:
819; ALL: # BB#0:
820; ALL-NEXT: retq
821 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
822 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
823 ret <4 x i32> %2
824}
825
826; The following pair of shuffles is folded into vector %B.
; (Every defined lane of the outer mask maps back to %B in its original
; position, so only a register copy of %B remains.)
827define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
828; SSE-LABEL: combine_nested_undef_test14:
829; SSE: # BB#0:
830; SSE-NEXT: movaps %xmm1, %xmm0
831; SSE-NEXT: retq
832;
833; AVX-LABEL: combine_nested_undef_test14:
834; AVX: # BB#0:
835; AVX-NEXT: vmovaps %xmm1, %xmm0
836; AVX-NEXT: retq
837 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
838 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
839 ret <4 x i32> %2
840}
841
842
843; Verify that we don't optimize the following cases. We expect more than one shuffle.
844;
845; FIXME: Many of these already don't make sense, and the rest should stop
846; making sense with the new vector shuffle lowering. Revisit at least testing for
847; it.
848
; The combined mask needs lanes from both %A and %B, so more than one shuffle
; instruction remains in the output.
849define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
850; SSE-LABEL: combine_nested_undef_test15:
851; SSE: # BB#0:
852; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
853; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,1]
854; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
855; SSE-NEXT: retq
856;
857; AVX-LABEL: combine_nested_undef_test15:
858; AVX: # BB#0:
859; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
860; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[3,1]
861; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
862; AVX-NEXT: retq
863 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
864 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
865 ret <4 x i32> %2
866}
867
; The combined mask interleaves lanes of %A and %B, so a two-input shuffle
; (shufps or blend) plus a pshufd remain; SSE4.1/AVX can use a blend.
868define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
869; SSE2-LABEL: combine_nested_undef_test16:
870; SSE2: # BB#0:
871; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
872; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3]
873; SSE2-NEXT: retq
874;
875; SSSE3-LABEL: combine_nested_undef_test16:
876; SSSE3: # BB#0:
877; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
878; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3]
879; SSSE3-NEXT: retq
880;
881; SSE41-LABEL: combine_nested_undef_test16:
882; SSE41: # BB#0:
883; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
884; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
885; SSE41-NEXT: retq
886;
887; AVX1-LABEL: combine_nested_undef_test16:
888; AVX1: # BB#0:
889; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
890; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
891; AVX1-NEXT: retq
892;
893; AVX2-LABEL: combine_nested_undef_test16:
894; AVX2: # BB#0:
895; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
896; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
897; AVX2-NEXT: retq
898 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
899 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
900 ret <4 x i32> %2
901}
902
; The combined mask needs lanes from both %A and %B, so more than one shuffle
; instruction remains in the output.
903define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
904; SSE-LABEL: combine_nested_undef_test17:
905; SSE: # BB#0:
906; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
907; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
908; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
909; SSE-NEXT: retq
910;
911; AVX-LABEL: combine_nested_undef_test17:
912; AVX: # BB#0:
913; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
914; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
915; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
916; AVX-NEXT: retq
917 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
918 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
919 ret <4 x i32> %2
920}
921
; Here the outer mask only reads lanes that came from %B, so the pair still
; folds to a single pshufd of %B (xmm1).
922define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
923; SSE-LABEL: combine_nested_undef_test18:
924; SSE: # BB#0:
925; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
926; SSE-NEXT: retq
927;
928; AVX-LABEL: combine_nested_undef_test18:
929; AVX: # BB#0:
930; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
931; AVX-NEXT: retq
932 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
933 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
934 ret <4 x i32> %2
935}
936
; The combined mask needs lanes from both %A and %B, so more than one shuffle
; instruction remains in the output.
937define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
938; SSE-LABEL: combine_nested_undef_test19:
939; SSE: # BB#0:
940; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
941; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
942; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
943; SSE-NEXT: retq
944;
945; AVX-LABEL: combine_nested_undef_test19:
946; AVX: # BB#0:
947; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
948; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
949; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
950; AVX-NEXT: retq
951 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
952 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
953 ret <4 x i32> %2
954}
955
; The combined mask needs lanes from both %A and %B, so a two-input shufps
; plus a pshufd remain.
956define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
957; SSE-LABEL: combine_nested_undef_test20:
958; SSE: # BB#0:
959; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
960; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
961; SSE-NEXT: retq
962;
963; AVX-LABEL: combine_nested_undef_test20:
964; AVX: # BB#0:
965; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
966; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
967; AVX-NEXT: retq
968 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
969 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
970 ret <4 x i32> %2
971}
972
; The combined mask needs lanes from both %A and %B, so more than one shuffle
; instruction remains in the output.
973define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
974; SSE-LABEL: combine_nested_undef_test21:
975; SSE: # BB#0:
976; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
977; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
978; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
979; SSE-NEXT: retq
980;
981; AVX-LABEL: combine_nested_undef_test21:
982; AVX: # BB#0:
983; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
984; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
985; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
986; AVX-NEXT: retq
987 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
988 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
989 ret <4 x i32> %2
990}
991
992
993; Test that we correctly combine shuffles according to rule
994; shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
995
; Only %B's lanes survive the outer mask, so the pair folds to a single
; pshufd of %B (xmm1), demonstrating the rule above.
996define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
997; SSE-LABEL: combine_nested_undef_test22:
998; SSE: # BB#0:
999; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1000; SSE-NEXT: retq
1001;
1002; AVX-LABEL: combine_nested_undef_test22:
1003; AVX: # BB#0:
1004; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1005; AVX-NEXT: retq
1006 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1007 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
1008 ret <4 x i32> %2
1009}
1010
; Only %B's lanes survive the outer mask, so the pair folds to a single
; pshufd of %B (xmm1).
1011define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
1012; SSE-LABEL: combine_nested_undef_test23:
1013; SSE: # BB#0:
1014; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1015; SSE-NEXT: retq
1016;
1017; AVX-LABEL: combine_nested_undef_test23:
1018; AVX: # BB#0:
1019; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1020; AVX-NEXT: retq
1021 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1022 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1023 ret <4 x i32> %2
1024}
1025
; Only %B's lanes survive the outer mask, so the pair folds to a single
; pshufd of %B (xmm1).
1026define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
1027; SSE-LABEL: combine_nested_undef_test24:
1028; SSE: # BB#0:
1029; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,0]
1030; SSE-NEXT: retq
1031;
1032; AVX-LABEL: combine_nested_undef_test24:
1033; AVX: # BB#0:
1034; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,0]
1035; AVX-NEXT: retq
1036 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1037 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
1038 ret <4 x i32> %2
1039}
1040
; The first shuffle's operands are (%B, %A); the outer mask keeps only lanes
; from the second operand %A, so one pshufd of %A (xmm0) results.
1041define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
1042; SSE-LABEL: combine_nested_undef_test25:
1043; SSE: # BB#0:
1044; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1045; SSE-NEXT: retq
1046;
1047; AVX-LABEL: combine_nested_undef_test25:
1048; AVX: # BB#0:
1049; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1050; AVX-NEXT: retq
1051 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
1052 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
1053 ret <4 x i32> %2
1054}
1055
; The outer mask keeps only %A's high lanes, so the pair folds to a single
; movhlps of %A with itself.
1056define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
1057; SSE-LABEL: combine_nested_undef_test26:
1058; SSE: # BB#0:
1059; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
1060; SSE-NEXT: retq
1061;
1062; AVX-LABEL: combine_nested_undef_test26:
1063; AVX: # BB#0:
1064; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
1065; AVX-NEXT: retq
1066 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
1067 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
1068 ret <4 x i32> %2
1069}
1070
; The first shuffle's operands are (%B, %A); the outer mask keeps only lanes
; from the second operand %A, so one pshufd of %A (xmm0) results.
1071define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
1072; SSE-LABEL: combine_nested_undef_test27:
1073; SSE: # BB#0:
1074; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1075; SSE-NEXT: retq
1076;
1077; AVX-LABEL: combine_nested_undef_test27:
1078; AVX: # BB#0:
1079; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1080; AVX-NEXT: retq
1081 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
1082 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
1083 ret <4 x i32> %2
1084}
1085
; The first shuffle's operands are (%B, %A); the outer mask keeps only lanes
; from the second operand %A, so one pshufd of %A (xmm0) results.
1086define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
1087; SSE-LABEL: combine_nested_undef_test28:
1088; SSE: # BB#0:
1089; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1090; SSE-NEXT: retq
1091;
1092; AVX-LABEL: combine_nested_undef_test28:
1093; AVX: # BB#0:
1094; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1095; AVX-NEXT: retq
1096 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
1097 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
1098 ret <4 x i32> %2
1099}
1100
; The final mask selects only %b's lanes, so SSSE3 and later fold the pair to
; a plain copy of %b; SSE2 still emits a four-shuffle sequence.
1101define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
1102; SSE2-LABEL: combine_test1:
1103; SSE2: # BB#0:
1104; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1105; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1106; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
1107; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1108; SSE2-NEXT: retq
1109;
1110; SSSE3-LABEL: combine_test1:
1111; SSSE3: # BB#0:
1112; SSSE3-NEXT: movaps %xmm1, %xmm0
1113; SSSE3-NEXT: retq
1114;
1115; SSE41-LABEL: combine_test1:
1116; SSE41: # BB#0:
1117; SSE41-NEXT: movaps %xmm1, %xmm0
1118; SSE41-NEXT: retq
1119;
1120; AVX-LABEL: combine_test1:
1121; AVX: # BB#0:
1122; AVX-NEXT: vmovaps %xmm1, %xmm0
1123; AVX-NEXT: retq
1124 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1125 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1126 ret <4 x float> %2
1127}
1128
; The pair combines to inserting %a's element 0 into %b: SSE4.1/AVX use a
; single movss, while SSE2/SSSE3 lower it with shufps sequences.
1129define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
1130; SSE2-LABEL: combine_test2:
1131; SSE2: # BB#0:
1132; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
1133; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1134; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
1135; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
1136; SSE2-NEXT: retq
1137;
1138; SSSE3-LABEL: combine_test2:
1139; SSSE3: # BB#0:
1140; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
1141; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1142; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
1143; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
1144; SSSE3-NEXT: retq
1145;
1146; SSE41-LABEL: combine_test2:
1147; SSE41: # BB#0:
1148; SSE41-NEXT: movss %xmm0, %xmm1
1149; SSE41-NEXT: movaps %xmm1, %xmm0
1150; SSE41-NEXT: retq
1151;
1152; AVX-LABEL: combine_test2:
1153; AVX: # BB#0:
1154; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
1155; AVX-NEXT: retq
1156 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1157 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1158 ret <4 x float> %2
1159}
1160
; The pair combines to a single movlhps (low halves of %a and %b).
1161define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
1162; SSE-LABEL: combine_test3:
1163; SSE: # BB#0:
1164; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1165; SSE-NEXT: retq
1166;
1167; AVX-LABEL: combine_test3:
1168; AVX: # BB#0:
1169; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1170; AVX-NEXT: retq
1171 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1172 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1173 ret <4 x float> %2
1174}
1175
; The pair combines to a single movhlps (high halves of %b and %a).
1176define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
1177; SSE-LABEL: combine_test4:
1178; SSE: # BB#0:
1179; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1180; SSE-NEXT: retq
1181;
1182; AVX-LABEL: combine_test4:
1183; AVX: # BB#0:
1184; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1185; AVX-NEXT: retq
1186 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1187 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1188 ret <4 x float> %2
1189}
1190
; The pair combines to selecting %a's element 1 into %b: SSE4.1/AVX use a
; single blendps; SSE2/SSSE3 fall back to shufps sequences.
1191define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
1192; SSE2-LABEL: combine_test5:
1193; SSE2: # BB#0:
1194; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1195; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1196; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
1197; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
1198; SSE2-NEXT: retq
1199;
1200; SSSE3-LABEL: combine_test5:
1201; SSSE3: # BB#0:
1202; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1203; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1204; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
1205; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
1206; SSSE3-NEXT: retq
1207;
1208; SSE41-LABEL: combine_test5:
1209; SSE41: # BB#0:
1210; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1211; SSE41-NEXT: retq
1212;
1213; AVX-LABEL: combine_test5:
1214; AVX: # BB#0:
1215; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1216; AVX-NEXT: retq
1217 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1218 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1219 ret <4 x float> %2
1220}
1221
; Integer version of combine_test1: the final mask selects only %b's lanes,
; so SSSE3 and later fold to a plain copy of %b; SSE2 misses the fold.
1222define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
1223; SSE2-LABEL: combine_test6:
1224; SSE2: # BB#0:
1225; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1226; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1227; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
1228; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1229; SSE2-NEXT: retq
1230;
1231; SSSE3-LABEL: combine_test6:
1232; SSSE3: # BB#0:
1233; SSSE3-NEXT: movaps %xmm1, %xmm0
1234; SSSE3-NEXT: retq
1235;
1236; SSE41-LABEL: combine_test6:
1237; SSE41: # BB#0:
1238; SSE41-NEXT: movaps %xmm1, %xmm0
1239; SSE41-NEXT: retq
1240;
1241; AVX-LABEL: combine_test6:
1242; AVX: # BB#0:
1243; AVX-NEXT: vmovaps %xmm1, %xmm0
1244; AVX-NEXT: retq
1245 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1246 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1247 ret <4 x i32> %2
1248}
1249
; Integer version of combine_test2: combines to inserting %a's element 0 into
; %b (movss on SSE4.1/AVX; shufps sequences on SSE2/SSSE3).
1250define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
1251; SSE2-LABEL: combine_test7:
1252; SSE2: # BB#0:
1253; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
1254; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1255; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
1256; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
1257; SSE2-NEXT: retq
1258;
1259; SSSE3-LABEL: combine_test7:
1260; SSSE3: # BB#0:
1261; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
1262; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1263; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
1264; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
1265; SSSE3-NEXT: retq
1266;
1267; SSE41-LABEL: combine_test7:
1268; SSE41: # BB#0:
1269; SSE41-NEXT: movss %xmm0, %xmm1
1270; SSE41-NEXT: movaps %xmm1, %xmm0
1271; SSE41-NEXT: retq
1272;
1273; AVX-LABEL: combine_test7:
1274; AVX: # BB#0:
1275; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
1276; AVX-NEXT: retq
1277 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1278 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1279 ret <4 x i32> %2
1280}
1281
; Integer version of combine_test3: combines to a single movlhps.
1282define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
1283; SSE-LABEL: combine_test8:
1284; SSE: # BB#0:
1285; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1286; SSE-NEXT: retq
1287;
1288; AVX-LABEL: combine_test8:
1289; AVX: # BB#0:
1290; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1291; AVX-NEXT: retq
1292 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1293 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1294 ret <4 x i32> %2
1295}
1296
; Integer version of combine_test4: combines to a single movhlps.
1297define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
1298; SSE-LABEL: combine_test9:
1299; SSE: # BB#0:
1300; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1301; SSE-NEXT: retq
1302;
1303; AVX-LABEL: combine_test9:
1304; AVX: # BB#0:
1305; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1306; AVX-NEXT: retq
1307 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1308 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1309 ret <4 x i32> %2
1310}
1311
; Integer version of combine_test5: combines to a single blend on SSE4.1/AVX
; (vpblendd on AVX2); SSE2/SSSE3 fall back to shufps sequences.
1312define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
1313; SSE2-LABEL: combine_test10:
1314; SSE2: # BB#0:
1315; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1316; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1317; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
1318; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
1319; SSE2-NEXT: retq
1320;
1321; SSSE3-LABEL: combine_test10:
1322; SSSE3: # BB#0:
1323; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1324; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1325; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
1326; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
1327; SSSE3-NEXT: retq
1328;
1329; SSE41-LABEL: combine_test10:
1330; SSE41: # BB#0:
1331; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1332; SSE41-NEXT: retq
1333;
1334; AVX1-LABEL: combine_test10:
1335; AVX1: # BB#0:
1336; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1337; AVX1-NEXT: retq
1338;
1339; AVX2-LABEL: combine_test10:
1340; AVX2: # BB#0:
1341; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1342; AVX2-NEXT: retq
1343 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1344 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1345 ret <4 x i32> %2
1346}
1347
; The second shuffle re-selects from %a exactly the lanes the first one took
; from %b's slots, reconstructing %a - the pair folds away entirely.
1348define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
1349; ALL-LABEL: combine_test11:
1350; ALL: # BB#0:
1351; ALL-NEXT: retq
1352 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1353 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1354 ret <4 x float> %2
1355}
1356
; Combines to inserting %a's element 0 into %b; note SSE2/SSSE3 emit a
; redundant second movss here (missed fold captured by the CHECK lines).
1357define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
1358; SSE2-LABEL: combine_test12:
1359; SSE2: # BB#0:
1360; SSE2-NEXT: movss %xmm0, %xmm1
1361; SSE2-NEXT: movss %xmm0, %xmm1
1362; SSE2-NEXT: movaps %xmm1, %xmm0
1363; SSE2-NEXT: retq
1364;
1365; SSSE3-LABEL: combine_test12:
1366; SSSE3: # BB#0:
1367; SSSE3-NEXT: movss %xmm0, %xmm1
1368; SSSE3-NEXT: movss %xmm0, %xmm1
1369; SSSE3-NEXT: movaps %xmm1, %xmm0
1370; SSSE3-NEXT: retq
1371;
1372; SSE41-LABEL: combine_test12:
1373; SSE41: # BB#0:
1374; SSE41-NEXT: movss %xmm0, %xmm1
1375; SSE41-NEXT: movaps %xmm1, %xmm0
1376; SSE41-NEXT: retq
1377;
1378; AVX-LABEL: combine_test12:
1379; AVX: # BB#0:
1380; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
1381; AVX-NEXT: retq
1382 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1383 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1384 ret <4 x float> %2
1385}
1386
; The pair combines to a single movlhps (low halves of %a and %b).
1387define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
1388; SSE-LABEL: combine_test13:
1389; SSE: # BB#0:
1390; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1391; SSE-NEXT: retq
1392;
1393; AVX-LABEL: combine_test13:
1394; AVX: # BB#0:
1395; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1396; AVX-NEXT: retq
1397 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1398 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1399 ret <4 x float> %2
1400}
1401
; The pair combines to a single movhlps (high halves of %a and %b).
1402define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
1403; SSE-LABEL: combine_test14:
1404; SSE: # BB#0:
1405; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
1406; SSE-NEXT: movaps %xmm1, %xmm0
1407; SSE-NEXT: retq
1408;
1409; AVX-LABEL: combine_test14:
1410; AVX: # BB#0:
1411; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1412; AVX-NEXT: retq
1413 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1414 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1415 ret <4 x float> %2
1416}
1417
; Combines to selecting %a's element 1 into %b: a single blendps on
; SSE4.1/AVX; SSE2/SSSE3 lower it with shufps sequences through xmm2.
1418define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
1419; SSE2-LABEL: combine_test15:
1420; SSE2: # BB#0:
1421; SSE2-NEXT: movaps %xmm0, %xmm2
1422; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
1423; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
1424; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
1425; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
1426; SSE2-NEXT: retq
1427;
1428; SSSE3-LABEL: combine_test15:
1429; SSSE3: # BB#0:
1430; SSSE3-NEXT: movaps %xmm0, %xmm2
1431; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
1432; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
1433; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
1434; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
1435; SSSE3-NEXT: retq
1436;
1437; SSE41-LABEL: combine_test15:
1438; SSE41: # BB#0:
1439; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1440; SSE41-NEXT: retq
1441;
1442; AVX-LABEL: combine_test15:
1443; AVX: # BB#0:
1444; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1445; AVX-NEXT: retq
1446 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1447 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1448 ret <4 x float> %2
1449}
1450
; Integer version of combine_test11: the second shuffle reconstructs %a, so
; the pair folds away entirely (just retq).
1451define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
1452; ALL-LABEL: combine_test16:
1453; ALL: # BB#0:
1454; ALL-NEXT: retq
1455 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1456 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1457 ret <4 x i32> %2
1458}
1459
; Integer version of combine_test12: folds to movss of %a[0] into %b;
; SSE2/SSSE3 emit a redundant second movss (missed fold).
1460define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
1461; SSE2-LABEL: combine_test17:
1462; SSE2: # BB#0:
1463; SSE2-NEXT: movss %xmm0, %xmm1
1464; SSE2-NEXT: movss %xmm0, %xmm1
1465; SSE2-NEXT: movaps %xmm1, %xmm0
1466; SSE2-NEXT: retq
1467;
1468; SSSE3-LABEL: combine_test17:
1469; SSSE3: # BB#0:
1470; SSSE3-NEXT: movss %xmm0, %xmm1
1471; SSSE3-NEXT: movss %xmm0, %xmm1
1472; SSSE3-NEXT: movaps %xmm1, %xmm0
1473; SSSE3-NEXT: retq
1474;
1475; SSE41-LABEL: combine_test17:
1476; SSE41: # BB#0:
1477; SSE41-NEXT: movss %xmm0, %xmm1
1478; SSE41-NEXT: movaps %xmm1, %xmm0
1479; SSE41-NEXT: retq
1480;
1481; AVX-LABEL: combine_test17:
1482; AVX: # BB#0:
1483; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
1484; AVX-NEXT: retq
1485 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1486 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1487 ret <4 x i32> %2
1488}
1489
; Integer version of combine_test13: combines to a single movlhps.
1490define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
1491; SSE-LABEL: combine_test18:
1492; SSE: # BB#0:
1493; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1494; SSE-NEXT: retq
1495;
1496; AVX-LABEL: combine_test18:
1497; AVX: # BB#0:
1498; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1499; AVX-NEXT: retq
1500 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1501 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1502 ret <4 x i32> %2
1503}
1504
; Integer version of combine_test14: combines to a single movhlps.
1505define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
1506; SSE-LABEL: combine_test19:
1507; SSE: # BB#0:
1508; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
1509; SSE-NEXT: movaps %xmm1, %xmm0
1510; SSE-NEXT: retq
1511;
1512; AVX-LABEL: combine_test19:
1513; AVX: # BB#0:
1514; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1515; AVX-NEXT: retq
1516 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1517 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1518 ret <4 x i32> %2
1519}
1520
; Integer version of combine_test15: combines to a single blend on
; SSE4.1/AVX (vpblendd on AVX2); SSE2/SSSE3 use shufps sequences via xmm2.
1521define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
1522; SSE2-LABEL: combine_test20:
1523; SSE2: # BB#0:
1524; SSE2-NEXT: movaps %xmm0, %xmm2
1525; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
1526; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
1527; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
1528; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
1529; SSE2-NEXT: retq
1530;
1531; SSSE3-LABEL: combine_test20:
1532; SSSE3: # BB#0:
1533; SSSE3-NEXT: movaps %xmm0, %xmm2
1534; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
1535; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
1536; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
1537; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
1538; SSSE3-NEXT: retq
1539;
1540; SSE41-LABEL: combine_test20:
1541; SSE41: # BB#0:
1542; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1543; SSE41-NEXT: retq
1544;
1545; AVX1-LABEL: combine_test20:
1546; AVX1: # BB#0:
1547; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1548; AVX1-NEXT: retq
1549;
1550; AVX2-LABEL: combine_test20:
1551; AVX2: # BB#0:
1552; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1553; AVX2-NEXT: retq
1554 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1555 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1556 ret <4 x i32> %2
1557}
1558
1559
1560; Check some negative cases.
1561; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1562
; Negative case: the codegen shown still emits a multi-shuffle sequence for
; this pair rather than a single shuffle (see the FIXME above).
1563define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
1564; SSE2-LABEL: combine_test1b:
1565; SSE2: # BB#0:
1566; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1567; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1568; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1569; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
1570; SSE2-NEXT: movaps %xmm1, %xmm0
1571; SSE2-NEXT: retq
1572;
1573; SSSE3-LABEL: combine_test1b:
1574; SSSE3: # BB#0:
1575; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1576; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1577; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1578; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
1579; SSSE3-NEXT: movaps %xmm1, %xmm0
1580; SSSE3-NEXT: retq
1581;
1582; SSE41-LABEL: combine_test1b:
1583; SSE41: # BB#0:
1584; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1585; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1586; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
1587; SSE41-NEXT: movaps %xmm1, %xmm0
1588; SSE41-NEXT: retq
1589;
1590; AVX-LABEL: combine_test1b:
1591; AVX: # BB#0:
1592; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1593; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1594; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,0]
1595; AVX-NEXT: retq
1596 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1597 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
1598 ret <4 x float> %2
1599}
1600
; Negative case: the codegen shown still emits a multi-shuffle sequence for
; this pair rather than a single shuffle (see the FIXME above).
1601define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
1602; SSE2-LABEL: combine_test2b:
1603; SSE2: # BB#0:
1604; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1605; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1606; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1]
1607; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1608; SSE2-NEXT: retq
1609;
1610; SSSE3-LABEL: combine_test2b:
1611; SSSE3: # BB#0:
1612; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
1613; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
1614; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1]
1615; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1616; SSSE3-NEXT: retq
1617;
1618; SSE41-LABEL: combine_test2b:
1619; SSE41: # BB#0:
1620; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1621; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1]
1622; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1623; SSE41-NEXT: retq
1624;
1625; AVX-LABEL: combine_test2b:
1626; AVX: # BB#0:
1627; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1628; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1]
1629; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1630; AVX-NEXT: retq
1631 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1632 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
1633 ret <4 x float> %2
1634}
1635
; Chained v4f32 shuffles whose combined mask still needs elements from both
; inputs (final result = a0,b3,b2,b3); verifies the combiner settles on the
; shufps/pshufd sequence below rather than something worse.
define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test3b:
; SSE: # BB#0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test3b:
; AVX: # BB#0:
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
 ret <4 x float> %2
}
1657
; Blend-like first shuffle (mask <4,1,6,3>) followed by a second shuffle that
; also reads %b directly (mask <5,5,2,7>); the SSE4.1/AVX checks expect the
; first shuffle to be recognized as a blendps before the remaining shufps.
define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test4b:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test4b:
; SSSE3: # BB#0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test4b:
; SSE41: # BB#0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test4b:
; AVX: # BB#0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[0,2]
; AVX-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
 ret <4 x float> %2
}
1695
1696
1697; Verify that we correctly fold shuffles even when we use illegal vector types.
1698
; Illegal-type variant: <4 x i8> loads get widened, then two chained shuffles
; whose combined mask is <0,5,6,7> (lane 0 of %A, rest from %B). On SSE4.1/AVX
; this should collapse to pmovzxbd + a single movss-style lane-0 insert; the
; pre-SSE4.1 sequences pin the (much longer) scalarized widening.
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
; SSE2: # BB#0:
; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: pinsrw $0, %eax, %xmm0
; SSE2-NEXT: movzbl %ah, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm0
; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: pinsrw $6, %ecx, %xmm0
; SSE2-NEXT: movl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: pinsrw $0, %eax, %xmm1
; SSE2-NEXT: movzbl %ah, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm1
; SSE2-NEXT: pinsrw $4, %ecx, %xmm1
; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: pinsrw $6, %ecx, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd (%rdi), %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128]
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movd (%rsi), %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test1c:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd (%rdi), %xmm1
; SSE41-NEXT: pmovzxbd (%rsi), %xmm0
; SSE41-NEXT: movss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test1c:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd (%rdi), %xmm0
; AVX-NEXT: vpmovzxbd (%rsi), %xmm1
; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
 %A = load <4 x i8>* %a
 %B = load <4 x i8>* %b
 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
 ret <4 x i8> %2
}
1758
; Illegal-type variant: the two <4 x i8> shuffles combine to A0,A1,B0,B1
; (low halves concatenated), which must lower to a single movlhps once both
; inputs are widened (pshufb on SSSE3, pmovzxbd on SSE4.1/AVX).
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test2c:
; SSE2: # BB#0:
; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: pinsrw $0, %eax, %xmm0
; SSE2-NEXT: movzbl %ah, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm0
; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: pinsrw $6, %ecx, %xmm0
; SSE2-NEXT: movl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: pinsrw $0, %eax, %xmm1
; SSE2-NEXT: movzbl %ah, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm1
; SSE2-NEXT: pinsrw $4, %ecx, %xmm1
; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: pinsrw $6, %ecx, %xmm1
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test2c:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd (%rdi), %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128]
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movd (%rsi), %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test2c:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd (%rdi), %xmm0
; SSE41-NEXT: pmovzxbd (%rsi), %xmm1
; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test2c:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd (%rdi), %xmm0
; AVX-NEXT: vpmovzxbd (%rsi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
 %A = load <4 x i8>* %a
 %B = load <4 x i8>* %b
 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
 ret <4 x i8> %2
}
1812
; Illegal-type variant: the chained shuffles combine to B2,B3,A2,A3 (high
; halves), so after widening the pair must fold to a single movhlps.
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test3c:
; SSE2: # BB#0:
; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: pinsrw $0, %eax, %xmm0
; SSE2-NEXT: movzbl %ah, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm0
; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: pinsrw $6, %ecx, %xmm0
; SSE2-NEXT: movl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: pinsrw $0, %eax, %xmm1
; SSE2-NEXT: movzbl %ah, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm1
; SSE2-NEXT: pinsrw $4, %ecx, %xmm1
; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: pinsrw $6, %ecx, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test3c:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd (%rdi), %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128]
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movd (%rsi), %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test3c:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd (%rdi), %xmm0
; SSE41-NEXT: pmovzxbd (%rsi), %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test3c:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd (%rdi), %xmm0
; AVX-NEXT: vpmovzxbd (%rsi), %xmm1
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
 %A = load <4 x i8>* %a
 %B = load <4 x i8>* %b
 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
 ret <4 x i8> %2
}
1866
; Illegal-type variant: the chained shuffles combine to B0,A1,B2,B3, i.e. a
; pure per-lane blend; SSE4.1 should emit blendps and AVX2 the integer
; vpblendd, while pre-SSE4.1 targets fall back to shufps sequences.
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2: # BB#0:
; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: pinsrw $0, %eax, %xmm0
; SSE2-NEXT: movzbl %ah, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm0
; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: pinsrw $6, %ecx, %xmm0
; SSE2-NEXT: movl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: pinsrw $0, %eax, %xmm1
; SSE2-NEXT: movzbl %ah, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm1
; SSE2-NEXT: pinsrw $4, %ecx, %xmm1
; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: pinsrw $6, %ecx, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd (%rdi), %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128]
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movd (%rsi), %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[0,2]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test4c:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd (%rdi), %xmm0
; SSE41-NEXT: pmovzxbd (%rsi), %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test4c:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0
; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test4c:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0
; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT: retq
 %A = load <4 x i8>* %a
 %B = load <4 x i8>* %b
 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
 ret <4 x i8> %2
}
1933
1934
1935; The following test cases are generated from this C++ code
1936;
1937;__m128 blend_01(__m128 a, __m128 b)
1938;{
1939; __m128 s = a;
1940; s = _mm_blend_ps( s, b, 1<<0 );
1941; s = _mm_blend_ps( s, b, 1<<1 );
1942; return s;
1943;}
1944;
1945;__m128 blend_02(__m128 a, __m128 b)
1946;{
1947; __m128 s = a;
1948; s = _mm_blend_ps( s, b, 1<<0 );
1949; s = _mm_blend_ps( s, b, 1<<2 );
1950; return s;
1951;}
1952;
1953;__m128 blend_123(__m128 a, __m128 b)
1954;{
1955; __m128 s = a;
1956; s = _mm_blend_ps( s, b, 1<<1 );
1957; s = _mm_blend_ps( s, b, 1<<2 );
1958; s = _mm_blend_ps( s, b, 1<<3 );
1959; return s;
1960;}
1961
1962; Ideally, we should collapse the following shuffles into a single one.
1963
; Two _mm_blend_ps calls selecting lanes 0 and 1 of %b; the combined result
; is b0,b1,a2,a3, which on SSE4.1/AVX should fold to a single movsd. The
; SSE2/SSSE3 checks record the current (not yet fully collapsed) sequence.
define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_01:
; SSE2: # BB#0:
; SSE2-NEXT: movss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_01:
; SSSE3: # BB#0:
; SSSE3-NEXT: movss %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_blend_01:
; SSE41: # BB#0:
; SSE41-NEXT: movsd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_blend_01:
; AVX: # BB#0:
; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
 ret <4 x float> %shuffle6
}
1994
; Two _mm_blend_ps calls selecting lanes 0 and 2 of %b; the combined result
; is b0,a1,b2,a3, which on SSE4.1/AVX should fold to a single blendps.
define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_02:
; SSE2: # BB#0:
; SSE2-NEXT: movss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_02:
; SSSE3: # BB#0:
; SSSE3-NEXT: movss %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_blend_02:
; SSE41: # BB#0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_blend_02:
; AVX: # BB#0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT: retq
 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
 ret <4 x float> %shuffle6
}
2023
; Three chained _mm_blend_ps calls selecting lanes 1, 2 and 3 of %b; the
; combined result is a0,b1,b2,b3, which on SSE4.1/AVX should fold to a single
; movss of %a's low lane into %b.
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
; SSE2: # BB#0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: movss %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_123:
; SSSE3: # BB#0:
; SSSE3-NEXT: movaps %xmm1, %xmm2
; SSSE3-NEXT: movss %xmm0, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_blend_123:
; SSE41: # BB#0:
; SSE41-NEXT: movss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_blend_123:
; AVX: # BB#0:
; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
 ret <4 x float> %shuffle12
}
2056
; The two integer shuffles combine to b2,b3,a2,a3 (high 64 bits of %b then
; high 64 bits of %a), so the expected lowering is exactly one movhlps.
define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_1:
; SSE: # BB#0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test_movhl_1:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
 ret <4 x i32> %2
}
2071
; Same combined result as combine_test_movhl_1 (b2,b3,a2,a3) reached through
; different intermediate masks; must still collapse to a single movhlps.
define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_2:
; SSE: # BB#0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test_movhl_2:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
 ret <4 x i32> %2
}
2086
; Third mask variation with the same combined result (b2,b3,a2,a3); again the
; only instruction expected after combining is a single movhlps.
define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_3:
; SSE: # BB#0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test_movhl_3:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
 ret <4 x i32> %2
}