1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
8
9;
10; vXf32 (accum)
11;
12
13define float @test_v2f32(float %a0, <2 x float> %a1) {
14; SSE2-LABEL: test_v2f32:
15; SSE2: # %bb.0:
16; SSE2-NEXT: mulss %xmm1, %xmm0
17; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
18; SSE2-NEXT: mulss %xmm1, %xmm0
19; SSE2-NEXT: retq
20;
21; SSE41-LABEL: test_v2f32:
22; SSE41: # %bb.0:
23; SSE41-NEXT: mulss %xmm1, %xmm0
24; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
25; SSE41-NEXT: mulss %xmm1, %xmm0
26; SSE41-NEXT: retq
27;
28; AVX-LABEL: test_v2f32:
29; AVX: # %bb.0:
30; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
31; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
32; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
33; AVX-NEXT: retq
34;
35; AVX512-LABEL: test_v2f32:
36; AVX512: # %bb.0:
37; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
38; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
39; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
40; AVX512-NEXT: retq
41 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
42 ret float %1
43}
44
45define float @test_v4f32(float %a0, <4 x float> %a1) {
46; SSE2-LABEL: test_v4f32:
47; SSE2: # %bb.0:
48; SSE2-NEXT: mulss %xmm1, %xmm0
49; SSE2-NEXT: movaps %xmm1, %xmm2
50; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
51; SSE2-NEXT: mulss %xmm2, %xmm0
52; SSE2-NEXT: movaps %xmm1, %xmm2
53; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
54; SSE2-NEXT: mulss %xmm2, %xmm0
55; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
56; SSE2-NEXT: mulss %xmm1, %xmm0
57; SSE2-NEXT: retq
58;
59; SSE41-LABEL: test_v4f32:
60; SSE41: # %bb.0:
61; SSE41-NEXT: mulss %xmm1, %xmm0
62; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
63; SSE41-NEXT: mulss %xmm2, %xmm0
64; SSE41-NEXT: movaps %xmm1, %xmm2
65; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
66; SSE41-NEXT: mulss %xmm2, %xmm0
67; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
68; SSE41-NEXT: mulss %xmm1, %xmm0
69; SSE41-NEXT: retq
70;
71; AVX-LABEL: test_v4f32:
72; AVX: # %bb.0:
73; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
74; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
75; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
76; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
77; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
78; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
79; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
80; AVX-NEXT: retq
81;
82; AVX512-LABEL: test_v4f32:
83; AVX512: # %bb.0:
84; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
85; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
86; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
87; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
88; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
89; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
90; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
91; AVX512-NEXT: retq
92 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
93 ret float %1
94}
95
96define float @test_v8f32(float %a0, <8 x float> %a1) {
97; SSE2-LABEL: test_v8f32:
98; SSE2: # %bb.0:
99; SSE2-NEXT: mulss %xmm1, %xmm0
100; SSE2-NEXT: movaps %xmm1, %xmm3
101; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[2,3]
102; SSE2-NEXT: mulss %xmm3, %xmm0
103; SSE2-NEXT: movaps %xmm1, %xmm3
104; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
105; SSE2-NEXT: mulss %xmm3, %xmm0
106; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
107; SSE2-NEXT: mulss %xmm1, %xmm0
108; SSE2-NEXT: mulss %xmm2, %xmm0
109; SSE2-NEXT: movaps %xmm2, %xmm1
110; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
111; SSE2-NEXT: mulss %xmm1, %xmm0
112; SSE2-NEXT: movaps %xmm2, %xmm1
113; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
114; SSE2-NEXT: mulss %xmm1, %xmm0
115; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
116; SSE2-NEXT: mulss %xmm2, %xmm0
117; SSE2-NEXT: retq
118;
119; SSE41-LABEL: test_v8f32:
120; SSE41: # %bb.0:
121; SSE41-NEXT: mulss %xmm1, %xmm0
122; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
123; SSE41-NEXT: mulss %xmm3, %xmm0
124; SSE41-NEXT: movaps %xmm1, %xmm3
125; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
126; SSE41-NEXT: mulss %xmm3, %xmm0
127; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
128; SSE41-NEXT: mulss %xmm1, %xmm0
129; SSE41-NEXT: mulss %xmm2, %xmm0
130; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
131; SSE41-NEXT: mulss %xmm1, %xmm0
132; SSE41-NEXT: movaps %xmm2, %xmm1
133; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
134; SSE41-NEXT: mulss %xmm1, %xmm0
135; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
136; SSE41-NEXT: mulss %xmm2, %xmm0
137; SSE41-NEXT: retq
138;
139; AVX-LABEL: test_v8f32:
140; AVX: # %bb.0:
141; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
142; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
143; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
144; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
145; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
146; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
147; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
148; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
149; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
150; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
151; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
152; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
153; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
154; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
155; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
156; AVX-NEXT: vzeroupper
157; AVX-NEXT: retq
158;
159; AVX512-LABEL: test_v8f32:
160; AVX512: # %bb.0:
161; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
162; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
163; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
164; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
165; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
166; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
167; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
168; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
169; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
170; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
171; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
172; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
173; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
174; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
175; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
176; AVX512-NEXT: vzeroupper
177; AVX512-NEXT: retq
178 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
179 ret float %1
180}
181
182define float @test_v16f32(float %a0, <16 x float> %a1) {
183; SSE2-LABEL: test_v16f32:
184; SSE2: # %bb.0:
185; SSE2-NEXT: mulss %xmm1, %xmm0
186; SSE2-NEXT: movaps %xmm1, %xmm5
187; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[2,3]
188; SSE2-NEXT: mulss %xmm5, %xmm0
189; SSE2-NEXT: movaps %xmm1, %xmm5
190; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
191; SSE2-NEXT: mulss %xmm5, %xmm0
192; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
193; SSE2-NEXT: mulss %xmm1, %xmm0
194; SSE2-NEXT: mulss %xmm2, %xmm0
195; SSE2-NEXT: movaps %xmm2, %xmm1
196; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
197; SSE2-NEXT: mulss %xmm1, %xmm0
198; SSE2-NEXT: movaps %xmm2, %xmm1
199; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
200; SSE2-NEXT: mulss %xmm1, %xmm0
201; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
202; SSE2-NEXT: mulss %xmm2, %xmm0
203; SSE2-NEXT: mulss %xmm3, %xmm0
204; SSE2-NEXT: movaps %xmm3, %xmm1
205; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
206; SSE2-NEXT: mulss %xmm1, %xmm0
207; SSE2-NEXT: movaps %xmm3, %xmm1
208; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
209; SSE2-NEXT: mulss %xmm1, %xmm0
210; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
211; SSE2-NEXT: mulss %xmm3, %xmm0
212; SSE2-NEXT: mulss %xmm4, %xmm0
213; SSE2-NEXT: movaps %xmm4, %xmm1
214; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[2,3]
215; SSE2-NEXT: mulss %xmm1, %xmm0
216; SSE2-NEXT: movaps %xmm4, %xmm1
217; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
218; SSE2-NEXT: mulss %xmm1, %xmm0
219; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
220; SSE2-NEXT: mulss %xmm4, %xmm0
221; SSE2-NEXT: retq
222;
223; SSE41-LABEL: test_v16f32:
224; SSE41: # %bb.0:
225; SSE41-NEXT: mulss %xmm1, %xmm0
226; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
227; SSE41-NEXT: mulss %xmm5, %xmm0
228; SSE41-NEXT: movaps %xmm1, %xmm5
229; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
230; SSE41-NEXT: mulss %xmm5, %xmm0
231; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
232; SSE41-NEXT: mulss %xmm1, %xmm0
233; SSE41-NEXT: mulss %xmm2, %xmm0
234; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
235; SSE41-NEXT: mulss %xmm1, %xmm0
236; SSE41-NEXT: movaps %xmm2, %xmm1
237; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
238; SSE41-NEXT: mulss %xmm1, %xmm0
239; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
240; SSE41-NEXT: mulss %xmm2, %xmm0
241; SSE41-NEXT: mulss %xmm3, %xmm0
242; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
243; SSE41-NEXT: mulss %xmm1, %xmm0
244; SSE41-NEXT: movaps %xmm3, %xmm1
245; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
246; SSE41-NEXT: mulss %xmm1, %xmm0
247; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
248; SSE41-NEXT: mulss %xmm3, %xmm0
249; SSE41-NEXT: mulss %xmm4, %xmm0
250; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
251; SSE41-NEXT: mulss %xmm1, %xmm0
252; SSE41-NEXT: movaps %xmm4, %xmm1
253; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
254; SSE41-NEXT: mulss %xmm1, %xmm0
255; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
256; SSE41-NEXT: mulss %xmm4, %xmm0
257; SSE41-NEXT: retq
258;
259; AVX-LABEL: test_v16f32:
260; AVX: # %bb.0:
261; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
262; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
263; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
264; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
265; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
266; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
267; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
268; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
269; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
270; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
271; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
272; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
273; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
274; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
275; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
276; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
277; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
278; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
279; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
280; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
281; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
282; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
283; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
284; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
285; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
286; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
287; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
288; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
289; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
290; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
291; AVX-NEXT: vzeroupper
292; AVX-NEXT: retq
293;
294; AVX512-LABEL: test_v16f32:
295; AVX512: # %bb.0:
296; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
297; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
298; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
299; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
300; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
301; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
302; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
303; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
304; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
305; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
306; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
307; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
308; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
309; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
310; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
311; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
312; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
313; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
314; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
315; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
316; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
317; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
318; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
319; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
320; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
321; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
322; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
323; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
324; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
325; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
326; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
327; AVX512-NEXT: vzeroupper
328; AVX512-NEXT: retq
329 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
330 ret float %1
331}
332
333;
334; vXf32 (one)
335;
336
337define float @test_v2f32_one(<2 x float> %a0) {
338; SSE2-LABEL: test_v2f32_one:
339; SSE2: # %bb.0:
340; SSE2-NEXT: movaps %xmm0, %xmm1
341; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
342; SSE2-NEXT: mulss %xmm0, %xmm1
343; SSE2-NEXT: movaps %xmm1, %xmm0
344; SSE2-NEXT: retq
345;
346; SSE41-LABEL: test_v2f32_one:
347; SSE41: # %bb.0:
348; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
349; SSE41-NEXT: mulss %xmm1, %xmm0
350; SSE41-NEXT: retq
351;
352; AVX-LABEL: test_v2f32_one:
353; AVX: # %bb.0:
354; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
355; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
356; AVX-NEXT: retq
357;
358; AVX512-LABEL: test_v2f32_one:
359; AVX512: # %bb.0:
360; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
361; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
362; AVX512-NEXT: retq
363 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
364 ret float %1
365}
366
367define float @test_v4f32_one(<4 x float> %a0) {
368; SSE2-LABEL: test_v4f32_one:
369; SSE2: # %bb.0:
370; SSE2-NEXT: movaps %xmm0, %xmm1
371; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
372; SSE2-NEXT: mulss %xmm0, %xmm1
373; SSE2-NEXT: movaps %xmm0, %xmm2
374; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
375; SSE2-NEXT: mulss %xmm1, %xmm2
376; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
377; SSE2-NEXT: mulss %xmm2, %xmm0
378; SSE2-NEXT: retq
379;
380; SSE41-LABEL: test_v4f32_one:
381; SSE41: # %bb.0:
382; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
383; SSE41-NEXT: mulss %xmm0, %xmm1
384; SSE41-NEXT: movaps %xmm0, %xmm2
385; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
386; SSE41-NEXT: mulss %xmm1, %xmm2
387; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
388; SSE41-NEXT: mulss %xmm2, %xmm0
389; SSE41-NEXT: retq
390;
391; AVX-LABEL: test_v4f32_one:
392; AVX: # %bb.0:
393; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
394; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
395; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
396; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
397; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
398; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
399; AVX-NEXT: retq
400;
401; AVX512-LABEL: test_v4f32_one:
402; AVX512: # %bb.0:
403; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
404; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
405; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
406; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
407; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
408; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
409; AVX512-NEXT: retq
410 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
411 ret float %1
412}
413
414define float @test_v8f32_one(<8 x float> %a0) {
415; SSE2-LABEL: test_v8f32_one:
416; SSE2: # %bb.0:
417; SSE2-NEXT: movaps %xmm0, %xmm2
418; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
419; SSE2-NEXT: mulss %xmm0, %xmm2
420; SSE2-NEXT: movaps %xmm0, %xmm3
421; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
422; SSE2-NEXT: mulss %xmm2, %xmm3
423; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
424; SSE2-NEXT: mulss %xmm3, %xmm0
425; SSE2-NEXT: mulss %xmm1, %xmm0
426; SSE2-NEXT: movaps %xmm1, %xmm2
427; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
428; SSE2-NEXT: mulss %xmm2, %xmm0
429; SSE2-NEXT: movaps %xmm1, %xmm2
430; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
431; SSE2-NEXT: mulss %xmm2, %xmm0
432; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
433; SSE2-NEXT: mulss %xmm1, %xmm0
434; SSE2-NEXT: retq
435;
436; SSE41-LABEL: test_v8f32_one:
437; SSE41: # %bb.0:
438; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
439; SSE41-NEXT: mulss %xmm0, %xmm2
440; SSE41-NEXT: movaps %xmm0, %xmm3
441; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
442; SSE41-NEXT: mulss %xmm2, %xmm3
443; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
444; SSE41-NEXT: mulss %xmm3, %xmm0
445; SSE41-NEXT: mulss %xmm1, %xmm0
446; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
447; SSE41-NEXT: mulss %xmm2, %xmm0
448; SSE41-NEXT: movaps %xmm1, %xmm2
449; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
450; SSE41-NEXT: mulss %xmm2, %xmm0
451; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
452; SSE41-NEXT: mulss %xmm1, %xmm0
453; SSE41-NEXT: retq
454;
455; AVX-LABEL: test_v8f32_one:
456; AVX: # %bb.0:
457; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
458; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
459; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
460; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
461; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
462; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
463; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
464; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1
465; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
466; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
467; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
468; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
469; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
470; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
471; AVX-NEXT: vzeroupper
472; AVX-NEXT: retq
473;
474; AVX512-LABEL: test_v8f32_one:
475; AVX512: # %bb.0:
476; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
477; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
478; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
479; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
480; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
481; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
482; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
483; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
484; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
485; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
486; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
487; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
488; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
489; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
490; AVX512-NEXT: vzeroupper
491; AVX512-NEXT: retq
492 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
493 ret float %1
494}
495
496define float @test_v16f32_one(<16 x float> %a0) {
497; SSE2-LABEL: test_v16f32_one:
498; SSE2: # %bb.0:
499; SSE2-NEXT: movaps %xmm0, %xmm4
500; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3]
501; SSE2-NEXT: mulss %xmm0, %xmm4
502; SSE2-NEXT: movaps %xmm0, %xmm5
503; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
504; SSE2-NEXT: mulss %xmm4, %xmm5
505; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
506; SSE2-NEXT: mulss %xmm5, %xmm0
507; SSE2-NEXT: mulss %xmm1, %xmm0
508; SSE2-NEXT: movaps %xmm1, %xmm4
509; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
510; SSE2-NEXT: mulss %xmm4, %xmm0
511; SSE2-NEXT: movaps %xmm1, %xmm4
512; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
513; SSE2-NEXT: mulss %xmm4, %xmm0
514; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
515; SSE2-NEXT: mulss %xmm1, %xmm0
516; SSE2-NEXT: mulss %xmm2, %xmm0
517; SSE2-NEXT: movaps %xmm2, %xmm1
518; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
519; SSE2-NEXT: mulss %xmm1, %xmm0
520; SSE2-NEXT: movaps %xmm2, %xmm1
521; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
522; SSE2-NEXT: mulss %xmm1, %xmm0
523; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
524; SSE2-NEXT: mulss %xmm2, %xmm0
525; SSE2-NEXT: mulss %xmm3, %xmm0
526; SSE2-NEXT: movaps %xmm3, %xmm1
527; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
528; SSE2-NEXT: mulss %xmm1, %xmm0
529; SSE2-NEXT: movaps %xmm3, %xmm1
530; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
531; SSE2-NEXT: mulss %xmm1, %xmm0
532; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
533; SSE2-NEXT: mulss %xmm3, %xmm0
534; SSE2-NEXT: retq
535;
536; SSE41-LABEL: test_v16f32_one:
537; SSE41: # %bb.0:
538; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
539; SSE41-NEXT: mulss %xmm0, %xmm4
540; SSE41-NEXT: movaps %xmm0, %xmm5
541; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
542; SSE41-NEXT: mulss %xmm4, %xmm5
543; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
544; SSE41-NEXT: mulss %xmm5, %xmm0
545; SSE41-NEXT: mulss %xmm1, %xmm0
546; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
547; SSE41-NEXT: mulss %xmm4, %xmm0
548; SSE41-NEXT: movaps %xmm1, %xmm4
549; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
550; SSE41-NEXT: mulss %xmm4, %xmm0
551; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
552; SSE41-NEXT: mulss %xmm1, %xmm0
553; SSE41-NEXT: mulss %xmm2, %xmm0
554; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
555; SSE41-NEXT: mulss %xmm1, %xmm0
556; SSE41-NEXT: movaps %xmm2, %xmm1
557; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
558; SSE41-NEXT: mulss %xmm1, %xmm0
559; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
560; SSE41-NEXT: mulss %xmm2, %xmm0
561; SSE41-NEXT: mulss %xmm3, %xmm0
562; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
563; SSE41-NEXT: mulss %xmm1, %xmm0
564; SSE41-NEXT: movaps %xmm3, %xmm1
565; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
566; SSE41-NEXT: mulss %xmm1, %xmm0
567; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
568; SSE41-NEXT: mulss %xmm3, %xmm0
569; SSE41-NEXT: retq
570;
571; AVX-LABEL: test_v16f32_one:
572; AVX: # %bb.0:
573; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
574; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm2
575; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
576; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
577; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
578; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
579; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
580; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2
581; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
582; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
583; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
584; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
585; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
586; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0
587; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
588; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
589; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
590; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
591; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
592; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
593; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
594; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
595; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
596; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
597; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
598; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
599; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
600; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
601; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
602; AVX-NEXT: vzeroupper
603; AVX-NEXT: retq
604;
605; AVX512-LABEL: test_v16f32_one:
606; AVX512: # %bb.0:
607; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
608; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
609; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
610; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
611; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
612; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
613; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
614; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
615; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
616; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
617; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
618; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
619; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
620; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
621; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
622; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
623; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
624; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
625; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
626; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
627; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
628; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
629; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
630; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
631; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
632; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
633; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
634; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
635; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
636; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
637; AVX512-NEXT: vzeroupper
638; AVX512-NEXT: retq
639 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
640 ret float %1
641}
642
643;
644; vXf32 (undef)
645;
646
647define float @test_v2f32_undef(<2 x float> %a0) {
648; SSE2-LABEL: test_v2f32_undef:
649; SSE2: # %bb.0:
650; SSE2-NEXT: movaps %xmm0, %xmm1
651; SSE2-NEXT: mulss %xmm0, %xmm1
652; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
653; SSE2-NEXT: mulss %xmm1, %xmm0
654; SSE2-NEXT: retq
655;
656; SSE41-LABEL: test_v2f32_undef:
657; SSE41: # %bb.0:
658; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
659; SSE41-NEXT: mulss %xmm0, %xmm0
660; SSE41-NEXT: mulss %xmm1, %xmm0
661; SSE41-NEXT: retq
662;
663; AVX-LABEL: test_v2f32_undef:
664; AVX: # %bb.0:
665; AVX-NEXT: vmulss %xmm0, %xmm0, %xmm1
666; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
667; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
668; AVX-NEXT: retq
669;
670; AVX512-LABEL: test_v2f32_undef:
671; AVX512: # %bb.0:
672; AVX512-NEXT: vmulss %xmm0, %xmm0, %xmm1
673; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
674; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
675; AVX512-NEXT: retq
676 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
677 ret float %1
678}
679
680define float @test_v4f32_undef(<4 x float> %a0) {
681; SSE2-LABEL: test_v4f32_undef:
682; SSE2: # %bb.0:
683; SSE2-NEXT: movaps %xmm0, %xmm1
684; SSE2-NEXT: mulss %xmm0, %xmm1
685; SSE2-NEXT: movaps %xmm0, %xmm2
686; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
687; SSE2-NEXT: mulss %xmm1, %xmm2
688; SSE2-NEXT: movaps %xmm0, %xmm1
689; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
690; SSE2-NEXT: mulss %xmm2, %xmm1
691; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
692; SSE2-NEXT: mulss %xmm1, %xmm0
693; SSE2-NEXT: retq
694;
695; SSE41-LABEL: test_v4f32_undef:
696; SSE41: # %bb.0:
697; SSE41-NEXT: movaps %xmm0, %xmm1
698; SSE41-NEXT: mulss %xmm0, %xmm1
699; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
700; SSE41-NEXT: mulss %xmm2, %xmm1
701; SSE41-NEXT: movaps %xmm0, %xmm2
702; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
703; SSE41-NEXT: mulss %xmm1, %xmm2
704; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
705; SSE41-NEXT: mulss %xmm2, %xmm0
706; SSE41-NEXT: retq
707;
708; AVX-LABEL: test_v4f32_undef:
709; AVX: # %bb.0:
710; AVX-NEXT: vmulss %xmm0, %xmm0, %xmm1
711; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
712; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
713; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
714; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
715; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
716; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
717; AVX-NEXT: retq
718;
719; AVX512-LABEL: test_v4f32_undef:
720; AVX512: # %bb.0:
721; AVX512-NEXT: vmulss %xmm0, %xmm0, %xmm1
722; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
723; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
724; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
725; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
726; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
727; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
728; AVX512-NEXT: retq
729 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
730 ret float %1
731}
732
733define float @test_v8f32_undef(<8 x float> %a0) {
734; SSE2-LABEL: test_v8f32_undef:
735; SSE2: # %bb.0:
736; SSE2-NEXT: movaps %xmm0, %xmm2
737; SSE2-NEXT: mulss %xmm0, %xmm2
738; SSE2-NEXT: movaps %xmm0, %xmm3
739; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
740; SSE2-NEXT: mulss %xmm2, %xmm3
741; SSE2-NEXT: movaps %xmm0, %xmm2
742; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
743; SSE2-NEXT: mulss %xmm3, %xmm2
744; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
745; SSE2-NEXT: mulss %xmm2, %xmm0
746; SSE2-NEXT: mulss %xmm1, %xmm0
747; SSE2-NEXT: movaps %xmm1, %xmm2
748; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
749; SSE2-NEXT: mulss %xmm2, %xmm0
750; SSE2-NEXT: movaps %xmm1, %xmm2
751; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
752; SSE2-NEXT: mulss %xmm2, %xmm0
753; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
754; SSE2-NEXT: mulss %xmm1, %xmm0
755; SSE2-NEXT: retq
756;
757; SSE41-LABEL: test_v8f32_undef:
758; SSE41: # %bb.0:
759; SSE41-NEXT: movaps %xmm0, %xmm2
760; SSE41-NEXT: mulss %xmm0, %xmm2
761; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
762; SSE41-NEXT: mulss %xmm3, %xmm2
763; SSE41-NEXT: movaps %xmm0, %xmm3
764; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
765; SSE41-NEXT: mulss %xmm2, %xmm3
766; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
767; SSE41-NEXT: mulss %xmm3, %xmm0
768; SSE41-NEXT: mulss %xmm1, %xmm0
769; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
770; SSE41-NEXT: mulss %xmm2, %xmm0
771; SSE41-NEXT: movaps %xmm1, %xmm2
772; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
773; SSE41-NEXT: mulss %xmm2, %xmm0
774; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
775; SSE41-NEXT: mulss %xmm1, %xmm0
776; SSE41-NEXT: retq
777;
778; AVX-LABEL: test_v8f32_undef:
779; AVX: # %bb.0:
780; AVX-NEXT: vmulss %xmm0, %xmm0, %xmm1
781; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
782; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
783; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
784; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
785; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
786; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
787; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
788; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1
789; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
790; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
791; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
792; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
793; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
794; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
795; AVX-NEXT: vzeroupper
796; AVX-NEXT: retq
797;
798; AVX512-LABEL: test_v8f32_undef:
799; AVX512: # %bb.0:
800; AVX512-NEXT: vmulss %xmm0, %xmm0, %xmm1
801; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
802; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
803; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
804; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
805; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
806; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
807; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
808; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
809; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
810; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
811; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
812; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
813; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
814; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
815; AVX512-NEXT: vzeroupper
816; AVX512-NEXT: retq
817 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
818 ret float %1
819}
820
821define float @test_v16f32_undef(<16 x float> %a0) {
822; SSE2-LABEL: test_v16f32_undef:
823; SSE2: # %bb.0:
824; SSE2-NEXT: movaps %xmm0, %xmm4
825; SSE2-NEXT: mulss %xmm0, %xmm4
826; SSE2-NEXT: movaps %xmm0, %xmm5
827; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[2,3]
828; SSE2-NEXT: mulss %xmm4, %xmm5
829; SSE2-NEXT: movaps %xmm0, %xmm4
830; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
831; SSE2-NEXT: mulss %xmm5, %xmm4
832; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
833; SSE2-NEXT: mulss %xmm4, %xmm0
834; SSE2-NEXT: mulss %xmm1, %xmm0
835; SSE2-NEXT: movaps %xmm1, %xmm4
836; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
837; SSE2-NEXT: mulss %xmm4, %xmm0
838; SSE2-NEXT: movaps %xmm1, %xmm4
839; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
840; SSE2-NEXT: mulss %xmm4, %xmm0
841; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
842; SSE2-NEXT: mulss %xmm1, %xmm0
843; SSE2-NEXT: mulss %xmm2, %xmm0
844; SSE2-NEXT: movaps %xmm2, %xmm1
845; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
846; SSE2-NEXT: mulss %xmm1, %xmm0
847; SSE2-NEXT: movaps %xmm2, %xmm1
848; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
849; SSE2-NEXT: mulss %xmm1, %xmm0
850; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
851; SSE2-NEXT: mulss %xmm2, %xmm0
852; SSE2-NEXT: mulss %xmm3, %xmm0
853; SSE2-NEXT: movaps %xmm3, %xmm1
854; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
855; SSE2-NEXT: mulss %xmm1, %xmm0
856; SSE2-NEXT: movaps %xmm3, %xmm1
857; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
858; SSE2-NEXT: mulss %xmm1, %xmm0
859; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
860; SSE2-NEXT: mulss %xmm3, %xmm0
861; SSE2-NEXT: retq
862;
863; SSE41-LABEL: test_v16f32_undef:
864; SSE41: # %bb.0:
865; SSE41-NEXT: movaps %xmm0, %xmm4
866; SSE41-NEXT: mulss %xmm0, %xmm4
867; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
868; SSE41-NEXT: mulss %xmm5, %xmm4
869; SSE41-NEXT: movaps %xmm0, %xmm5
870; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
871; SSE41-NEXT: mulss %xmm4, %xmm5
872; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
873; SSE41-NEXT: mulss %xmm5, %xmm0
874; SSE41-NEXT: mulss %xmm1, %xmm0
875; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
876; SSE41-NEXT: mulss %xmm4, %xmm0
877; SSE41-NEXT: movaps %xmm1, %xmm4
878; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
879; SSE41-NEXT: mulss %xmm4, %xmm0
880; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
881; SSE41-NEXT: mulss %xmm1, %xmm0
882; SSE41-NEXT: mulss %xmm2, %xmm0
883; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
884; SSE41-NEXT: mulss %xmm1, %xmm0
885; SSE41-NEXT: movaps %xmm2, %xmm1
886; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
887; SSE41-NEXT: mulss %xmm1, %xmm0
888; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
889; SSE41-NEXT: mulss %xmm2, %xmm0
890; SSE41-NEXT: mulss %xmm3, %xmm0
891; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
892; SSE41-NEXT: mulss %xmm1, %xmm0
893; SSE41-NEXT: movaps %xmm3, %xmm1
894; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
895; SSE41-NEXT: mulss %xmm1, %xmm0
896; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
897; SSE41-NEXT: mulss %xmm3, %xmm0
898; SSE41-NEXT: retq
899;
900; AVX-LABEL: test_v16f32_undef:
901; AVX: # %bb.0:
902; AVX-NEXT: vmulss %xmm0, %xmm0, %xmm2
903; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
904; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
905; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
906; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
907; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
908; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
909; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
910; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2
911; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
912; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
913; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
914; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
915; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
916; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0
917; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
918; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
919; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
920; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
921; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
922; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
923; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
924; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
925; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
926; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
927; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
928; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
929; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
930; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
931; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
932; AVX-NEXT: vzeroupper
933; AVX-NEXT: retq
934;
935; AVX512-LABEL: test_v16f32_undef:
936; AVX512: # %bb.0:
937; AVX512-NEXT: vmulss %xmm0, %xmm0, %xmm1
938; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
939; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
940; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
941; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
942; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
943; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
944; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
945; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
946; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
947; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
948; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
949; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
950; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
951; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
952; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
953; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
954; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
955; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
956; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
957; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
958; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
959; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
960; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
961; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
962; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
963; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
964; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
965; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
966; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
967; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
968; AVX512-NEXT: vzeroupper
969; AVX512-NEXT: retq
970 %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
971 ret float %1
972}
973
974;
975; vXf64 (accum)
976;
977
978define double @test_v2f64(double %a0, <2 x double> %a1) {
979; SSE-LABEL: test_v2f64:
980; SSE: # %bb.0:
981; SSE-NEXT: mulsd %xmm1, %xmm0
982; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
983; SSE-NEXT: mulsd %xmm1, %xmm0
984; SSE-NEXT: retq
985;
986; AVX-LABEL: test_v2f64:
987; AVX: # %bb.0:
988; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
989; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
990; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
991; AVX-NEXT: retq
992;
993; AVX512-LABEL: test_v2f64:
994; AVX512: # %bb.0:
995; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
996; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
997; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
998; AVX512-NEXT: retq
999 %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
1000 ret double %1
1001}
1002
1003define double @test_v4f64(double %a0, <4 x double> %a1) {
1004; SSE-LABEL: test_v4f64:
1005; SSE: # %bb.0:
1006; SSE-NEXT: mulsd %xmm1, %xmm0
1007; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
1008; SSE-NEXT: mulsd %xmm1, %xmm0
1009; SSE-NEXT: mulsd %xmm2, %xmm0
1010; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
1011; SSE-NEXT: mulsd %xmm2, %xmm0
1012; SSE-NEXT: retq
1013;
1014; AVX-LABEL: test_v4f64:
1015; AVX: # %bb.0:
1016; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1017; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1018; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1019; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
1020; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1021; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1022; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1023; AVX-NEXT: vzeroupper
1024; AVX-NEXT: retq
1025;
1026; AVX512-LABEL: test_v4f64:
1027; AVX512: # %bb.0:
1028; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1029; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1030; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1031; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
1032; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1033; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1034; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1035; AVX512-NEXT: vzeroupper
1036; AVX512-NEXT: retq
1037 %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
1038 ret double %1
1039}
1040
1041define double @test_v8f64(double %a0, <8 x double> %a1) {
1042; SSE-LABEL: test_v8f64:
1043; SSE: # %bb.0:
1044; SSE-NEXT: mulsd %xmm1, %xmm0
1045; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
1046; SSE-NEXT: mulsd %xmm1, %xmm0
1047; SSE-NEXT: mulsd %xmm2, %xmm0
1048; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
1049; SSE-NEXT: mulsd %xmm2, %xmm0
1050; SSE-NEXT: mulsd %xmm3, %xmm0
1051; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
1052; SSE-NEXT: mulsd %xmm3, %xmm0
1053; SSE-NEXT: mulsd %xmm4, %xmm0
1054; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
1055; SSE-NEXT: mulsd %xmm4, %xmm0
1056; SSE-NEXT: retq
1057;
1058; AVX-LABEL: test_v8f64:
1059; AVX: # %bb.0:
1060; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1061; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
1062; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
1063; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
1064; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1065; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1066; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1067; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1068; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1069; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1070; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
1071; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1072; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1073; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1074; AVX-NEXT: vzeroupper
1075; AVX-NEXT: retq
1076;
1077; AVX512-LABEL: test_v8f64:
1078; AVX512: # %bb.0:
1079; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1080; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1081; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1082; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
1083; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1084; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
1085; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1086; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
1087; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1088; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
1089; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1090; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
1091; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1092; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1093; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1094; AVX512-NEXT: vzeroupper
1095; AVX512-NEXT: retq
1096 %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
1097 ret double %1
1098}
1099
1100define double @test_v16f64(double %a0, <16 x double> %a1) {
1101; SSE-LABEL: test_v16f64:
1102; SSE: # %bb.0:
1103; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
1104; SSE-NEXT: mulsd %xmm1, %xmm0
1105; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
1106; SSE-NEXT: mulsd %xmm1, %xmm0
1107; SSE-NEXT: mulsd %xmm2, %xmm0
1108; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
1109; SSE-NEXT: mulsd %xmm2, %xmm0
1110; SSE-NEXT: mulsd %xmm3, %xmm0
1111; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
1112; SSE-NEXT: mulsd %xmm3, %xmm0
1113; SSE-NEXT: mulsd %xmm4, %xmm0
1114; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
1115; SSE-NEXT: mulsd %xmm4, %xmm0
1116; SSE-NEXT: mulsd %xmm5, %xmm0
1117; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1]
1118; SSE-NEXT: mulsd %xmm5, %xmm0
1119; SSE-NEXT: mulsd %xmm6, %xmm0
1120; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1]
1121; SSE-NEXT: mulsd %xmm6, %xmm0
1122; SSE-NEXT: mulsd %xmm7, %xmm0
1123; SSE-NEXT: movhlps {{.*#+}} xmm7 = xmm7[1,1]
1124; SSE-NEXT: mulsd %xmm7, %xmm0
1125; SSE-NEXT: mulsd %xmm8, %xmm0
1126; SSE-NEXT: movhlps {{.*#+}} xmm8 = xmm8[1,1]
1127; SSE-NEXT: mulsd %xmm8, %xmm0
1128; SSE-NEXT: retq
1129;
1130; AVX-LABEL: test_v16f64:
1131; AVX: # %bb.0:
1132; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1133; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
1134; AVX-NEXT: vmulsd %xmm5, %xmm0, %xmm0
1135; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
1136; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1137; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1138; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1139; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1140; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1141; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1142; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
1143; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1144; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1145; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1146; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
1147; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
1148; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1149; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
1150; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1151; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1152; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1153; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0
1154; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm4[1,0]
1155; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1156; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1
1157; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1158; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1159; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1160; AVX-NEXT: vzeroupper
1161; AVX-NEXT: retq
1162;
1163; AVX512-LABEL: test_v16f64:
1164; AVX512: # %bb.0:
1165; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1166; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
1167; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
1168; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3
1169; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
1170; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
1171; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
1172; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3
1173; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
1174; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
1175; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
1176; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
1177; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1178; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1179; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1180; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1181; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1182; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1183; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1
1184; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1185; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1186; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1187; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1
1188; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1189; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1190; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1191; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1
1192; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1193; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1194; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1195; AVX512-NEXT: vzeroupper
1196; AVX512-NEXT: retq
1197 %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
1198 ret double %1
1199}
1200
1201;
1202; vXf64 (one)
1203;
1204
1205define double @test_v2f64_one(<2 x double> %a0) {
1206; SSE-LABEL: test_v2f64_one:
1207; SSE: # %bb.0:
1208; SSE-NEXT: movaps %xmm0, %xmm1
1209; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
1210; SSE-NEXT: mulsd %xmm0, %xmm1
1211; SSE-NEXT: movapd %xmm1, %xmm0
1212; SSE-NEXT: retq
1213;
1214; AVX-LABEL: test_v2f64_one:
1215; AVX: # %bb.0:
1216; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1217; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1218; AVX-NEXT: retq
1219;
1220; AVX512-LABEL: test_v2f64_one:
1221; AVX512: # %bb.0:
1222; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1223; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1224; AVX512-NEXT: retq
1225 %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
1226 ret double %1
1227}
1228
1229define double @test_v4f64_one(<4 x double> %a0) {
1230; SSE-LABEL: test_v4f64_one:
1231; SSE: # %bb.0:
1232; SSE-NEXT: movaps %xmm0, %xmm2
1233; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
1234; SSE-NEXT: mulsd %xmm0, %xmm2
1235; SSE-NEXT: mulsd %xmm1, %xmm2
1236; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
1237; SSE-NEXT: mulsd %xmm1, %xmm2
1238; SSE-NEXT: movapd %xmm2, %xmm0
1239; SSE-NEXT: retq
1240;
1241; AVX-LABEL: test_v4f64_one:
1242; AVX: # %bb.0:
1243; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1244; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm1
1245; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
1246; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm1
1247; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1248; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
1249; AVX-NEXT: vzeroupper
1250; AVX-NEXT: retq
1251;
1252; AVX512-LABEL: test_v4f64_one:
1253; AVX512: # %bb.0:
1254; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1255; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1
1256; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
1257; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1
1258; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1259; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
1260; AVX512-NEXT: vzeroupper
1261; AVX512-NEXT: retq
1262 %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
1263 ret double %1
1264}
1265
1266define double @test_v8f64_one(<8 x double> %a0) {
1267; SSE-LABEL: test_v8f64_one:
1268; SSE: # %bb.0:
1269; SSE-NEXT: movaps %xmm0, %xmm4
1270; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
1271; SSE-NEXT: mulsd %xmm0, %xmm4
1272; SSE-NEXT: mulsd %xmm1, %xmm4
1273; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
1274; SSE-NEXT: mulsd %xmm1, %xmm4
1275; SSE-NEXT: mulsd %xmm2, %xmm4
1276; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
1277; SSE-NEXT: mulsd %xmm2, %xmm4
1278; SSE-NEXT: mulsd %xmm3, %xmm4
1279; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
1280; SSE-NEXT: mulsd %xmm3, %xmm4
1281; SSE-NEXT: movapd %xmm4, %xmm0
1282; SSE-NEXT: retq
1283;
1284; AVX-LABEL: test_v8f64_one:
1285; AVX: # %bb.0:
1286; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
1287; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm2
1288; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
1289; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm2
1290; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1291; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm0
1292; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1293; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1294; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1295; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
1296; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1297; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1298; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1299; AVX-NEXT: vzeroupper
1300; AVX-NEXT: retq
1301;
1302; AVX512-LABEL: test_v8f64_one:
1303; AVX512: # %bb.0:
1304; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1305; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1
1306; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
1307; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
1308; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
1309; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
1310; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
1311; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
1312; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
1313; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
1314; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
1315; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1
1316; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1317; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
1318; AVX512-NEXT: vzeroupper
1319; AVX512-NEXT: retq
1320 %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
1321 ret double %1
1322}
1323
1324define double @test_v16f64_one(<16 x double> %a0) {
1325; SSE-LABEL: test_v16f64_one:
1326; SSE: # %bb.0:
1327; SSE-NEXT: movaps %xmm0, %xmm8
1328; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
1329; SSE-NEXT: mulsd %xmm8, %xmm0
1330; SSE-NEXT: mulsd %xmm1, %xmm0
1331; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
1332; SSE-NEXT: mulsd %xmm1, %xmm0
1333; SSE-NEXT: mulsd %xmm2, %xmm0
1334; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
1335; SSE-NEXT: mulsd %xmm2, %xmm0
1336; SSE-NEXT: mulsd %xmm3, %xmm0
1337; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
1338; SSE-NEXT: mulsd %xmm3, %xmm0
1339; SSE-NEXT: mulsd %xmm4, %xmm0
1340; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
1341; SSE-NEXT: mulsd %xmm4, %xmm0
1342; SSE-NEXT: mulsd %xmm5, %xmm0
1343; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1]
1344; SSE-NEXT: mulsd %xmm5, %xmm0
1345; SSE-NEXT: mulsd %xmm6, %xmm0
1346; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1]
1347; SSE-NEXT: mulsd %xmm6, %xmm0
1348; SSE-NEXT: mulsd %xmm7, %xmm0
1349; SSE-NEXT: movhlps {{.*#+}} xmm7 = xmm7[1,1]
1350; SSE-NEXT: mulsd %xmm7, %xmm0
1351; SSE-NEXT: retq
1352;
1353; AVX-LABEL: test_v16f64_one:
1354; AVX: # %bb.0:
1355; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
1356; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm4
1357; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
1358; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm4
1359; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1360; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm0
1361; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1362; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
1363; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0
1364; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
1365; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1366; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1367; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1368; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1369; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1370; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1371; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
1372; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1373; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1374; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1375; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
1376; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
1377; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1378; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
1379; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1380; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1381; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1382; AVX-NEXT: vzeroupper
1383; AVX-NEXT: retq
1384;
1385; AVX512-LABEL: test_v16f64_one:
1386; AVX512: # %bb.0:
1387; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
1388; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm2
1389; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
1390; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
1391; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
1392; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
1393; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
1394; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
1395; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
1396; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
1397; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
1398; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm2
1399; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1400; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm0
1401; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1402; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1403; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1404; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
1405; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1406; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
1407; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1408; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
1409; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1410; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
1411; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
1412; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
1413; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1414; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1415; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1416; AVX512-NEXT: vzeroupper
1417; AVX512-NEXT: retq
1418 %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
1419 ret double %1
1420}
1421
1422;
1423; vXf64 (undef)
1424;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm0, %xmm0, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: mulsd %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm0, %xmm0, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm4
; SSE-NEXT: mulsd %xmm0, %xmm4
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm8
; SSE-NEXT: mulsd %xmm0, %xmm8
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: mulsd %xmm8, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: mulsd %xmm5, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: mulsd %xmm5, %xmm0
; SSE-NEXT: mulsd %xmm6, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: mulsd %xmm6, %xmm0
; SSE-NEXT: mulsd %xmm7, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: mulsd %xmm7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm0, %xmm0, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm5, %xmm4, %xmm4
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm0, %xmm0, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)