; blob: 690707b687078a204b78ade7821bbd3a3da376eb [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; Builds <x[0], 0.0, x[2], a[2]> from an extract/insert/shuffle chain; the
; expected lowering is a single (v)insertps that also zeroes lane 1.
; NOTE(review): the "7" in the name suggests a[3], but the final shuffle mask
; uses index 6 (a[2]) and the CHECK lines agree with the mask (xmm1[2]).
define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuffle_v4f32_0z27:
; SSE:       # BB#0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0z27:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}
; Builds <xyzw[0], 0.0, 0.0, abcd[0]>. The checked lowering zeroes a scratch
; register, blends lane 0 of %xyzw over it, then inserts abcd[0] into lane 3.
define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0zz4:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0zz4:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %xyzw, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}
; Builds <xyzw[0], 0.0, xyzw[2], abcd[0]>; the expected lowering is a single
; (v)insertps that places abcd[0] in lane 3 and zeroes lane 1.
define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0z24:
; SSE:       # BB#0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %xyzw, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %xyzw, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit5
}
; Builds <a, 0.0, 0.0, a> from a scalar argument. The checked lowering is not a
; single insertps: it zeroes a register, merges the scalar into lane 0, and
; then permutes lanes with mask [0,1,1,0].
define <4 x float> @shuffle_v4f32_0zz0(float %a) {
; SSE-LABEL: shuffle_v4f32_0zz0:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0zz0:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    retq
  %vecinit = insertelement <4 x float> undef, float %a, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float %a, i32 3
  ret <4 x float> %vecinit3
}
; Builds <A[0], 0.0, B[2], 0.0> with scalar extract/insert operations only; the
; expected lowering is a single (v)insertps that moves B[2] into lane 2 and
; zeroes lanes 1 and 3.
define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: shuffle_v4f32_0z6z:
; SSE:       # BB#0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0z6z:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}
; Extracts lane 0 from an insertps intrinsic call whose immediate zeroes that
; lane: imm8 21 = 0b00010101, and the low four bits (0101) are the zero mask,
; covering lanes 0 and 2. The whole sequence is expected to fold to a register
; zeroing (xorps / vxorps).
define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: extract_zero_insertps_z0z7:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: extract_zero_insertps_z0z7:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 21)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}
; Extracts lane 0 from an insertps intrinsic call whose inserted operand is
; loaded from memory; the expected lowering is a single scalar load
; ((v)movss from memory) with no vector load or insertps.
; NOTE(review): imm8 128 = 0b10000000 has source-select bits [7:6] = 2, i.e.
; lane 0 receives a1[2] (shuffle index 6), while the "5123" in the name
; suggests a1[1] (imm8 64). The {{.*#+}} pattern hides the memory operand, so
; the CHECK lines do not pin the load offset -- confirm which was intended.
define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
; SSE-LABEL: extract_lane_insertps_5123:
; SSE:       # BB#0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: extract_lane_insertps_5123:
; AVX:       # BB#0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %a1 = load <4 x float>, <4 x float> *%p1
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}
; Declaration of the SSE4.1 insertps intrinsic called by the extract_* tests
; above: two <4 x float> operands plus an i8 immediate control byte.
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone