; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s

; This test checks the fusing of MUL + SUB/ADD to FMSUBADD.

define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; FMA3_256-LABEL: mul_subadd_pd128:
; FMA3_256:       # BB#0: # %entry
; FMA3_256-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; FMA3_256-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
; FMA3_256-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; FMA3_256-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_subadd_pd128:
; FMA3_512:       # BB#0: # %entry
; FMA3_512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; FMA3_512-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
; FMA3_512-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; FMA3_512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd128:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
; FMA4-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; FMA4-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  ; Even lanes take the add, odd lanes take the sub -> fmsubadd pattern.
  %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %subadd
}
define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_subadd_ps128:
; FMA3:       # BB#0: # %entry
; FMA3-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; FMA3-NEXT:    vsubps %xmm2, %xmm0, %xmm1
; FMA3-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; FMA3-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps128:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vsubps %xmm2, %xmm0, %xmm1
; FMA4-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  ; Even lanes from %Add, odd lanes from %Sub -> fmsubadd pattern.
  %subadd = shufflevector <4 x float> %Add, <4 x float> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %subadd
}
define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_subadd_pd256:
; FMA3:       # BB#0: # %entry
; FMA3-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; FMA3-NEXT:    vsubpd %ymm2, %ymm0, %ymm1
; FMA3-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; FMA3-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd256:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    vsubpd %ymm2, %ymm0, %ymm1
; FMA4-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  ; Even lanes from %Add, odd lanes from %Sub -> fmsubadd pattern.
  %subadd = shufflevector <4 x double> %Add, <4 x double> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %subadd
}
define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_subadd_ps256:
; FMA3:       # BB#0: # %entry
; FMA3-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; FMA3-NEXT:    vsubps %ymm2, %ymm0, %ymm1
; FMA3-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; FMA3-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps256:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    vsubps %ymm2, %ymm0, %ymm1
; FMA4-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  ; Even lanes from %Add, odd lanes from %Sub -> fmsubadd pattern.
  %subadd = shufflevector <8 x float> %Add, <8 x float> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %subadd
}
define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_subadd_pd512:
; FMA3_256:       # BB#0: # %entry
; FMA3_256-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; FMA3_256-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; FMA3_256-NEXT:    vsubpd %ymm5, %ymm1, %ymm2
; FMA3_256-NEXT:    vsubpd %ymm4, %ymm0, %ymm3
; FMA3_256-NEXT:    vaddpd %ymm5, %ymm1, %ymm1
; FMA3_256-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
; FMA3_256-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
; FMA3_256-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_subadd_pd512:
; FMA3_512:       # BB#0: # %entry
; FMA3_512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; FMA3_512-NEXT:    vsubpd %zmm2, %zmm0, %zmm1
; FMA3_512-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
; FMA3_512-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7]
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd512:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    vsubpd %ymm5, %ymm1, %ymm2
; FMA4-NEXT:    vsubpd %ymm4, %ymm0, %ymm3
; FMA4-NEXT:    vaddpd %ymm5, %ymm1, %ymm1
; FMA4-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
; FMA4-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
; FMA4-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  ; Even lanes from %Add, odd lanes from %Sub -> fmsubadd pattern.
  %subadd = shufflevector <8 x double> %Add, <8 x double> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %subadd
}
define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_subadd_ps512:
; FMA3_256:       # BB#0: # %entry
; FMA3_256-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; FMA3_256-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; FMA3_256-NEXT:    vsubps %ymm5, %ymm1, %ymm2
; FMA3_256-NEXT:    vsubps %ymm4, %ymm0, %ymm3
; FMA3_256-NEXT:    vaddps %ymm5, %ymm1, %ymm1
; FMA3_256-NEXT:    vaddps %ymm4, %ymm0, %ymm0
; FMA3_256-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; FMA3_256-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_subadd_ps512:
; FMA3_512:       # BB#0: # %entry
; FMA3_512-NEXT:    vmulps %zmm1, %zmm0, %zmm1
; FMA3_512-NEXT:    vaddps %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; FMA3_512-NEXT:    kmovw %eax, %k1
; FMA3_512-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1}
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps512:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    vsubps %ymm5, %ymm1, %ymm2
; FMA4-NEXT:    vsubps %ymm4, %ymm0, %ymm3
; FMA4-NEXT:    vaddps %ymm5, %ymm1, %ymm1
; FMA4-NEXT:    vaddps %ymm4, %ymm0, %ymm0
; FMA4-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; FMA4-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; FMA4-NEXT:    retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  ; Even lanes from %Add, odd lanes from %Sub -> fmsubadd pattern.
  %subadd = shufflevector <16 x float> %Add, <16 x float> %Sub, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %subadd
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }