; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s

; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s

; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
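;
; Rough map from check prefixes to the expected selection, summarizing the
; CHECK lines below (not a full statement of the backend's selection rules):
;   GCN-FLUSH          -> v_mac_f32 / v_mad_f32
;   GCN-DENORM-FASTFMA -> v_fma_f32
;   GCN-DENORM-SLOWFMA -> unfused v_mul_f32/v_add_f32 (or add/sub sequences)
; The *-STRICT and *-CONTRACT prefixes differ only for the explicit fmul+fadd
; and fmul+fsub patterns; llvm.fmuladd itself may be fused in either mode.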

target triple = "amdgcn--"


declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare float @llvm.fabs.f32(float) #1

; GCN-LABEL: {{^}}fmuladd_f32:
; GCN-FLUSH: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                       float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load float, float addrspace(1)* %in1
  %r1 = load float, float addrspace(1)* %in2
  %r2 = load float, float addrspace(1)* %in3
  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmul_fadd_f32:
; GCN-FLUSH: v_mac_f32

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32

; GCN-DENORM-STRICT: v_mul_f32_e32
; GCN-DENORM-STRICT: v_add_f32_e32
define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load volatile float, float addrspace(1)* %in1
  %r1 = load volatile float, float addrspace(1)* %in2
  %r2 = load volatile float, float addrspace(1)* %in3
  %mul = fmul float %r0, %r1
  %add = fadd float %mul, %r2
  store float %add, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_a_a_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %add.0, %r1
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_b_a_a_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %r1, %add.0
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

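  ; fsub from -0.0 is the IR idiom used here to negate %r1.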
  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; SI-FLUSH: buffer_store_dword [[RESULT]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r2.fneg = fsub float -0.000000e+00, %r2

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}
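
; The mad_sub_* tests below cover an fmul feeding an fsub: per the CHECK
; lines, the subtraction and any fneg/fabs fold into the v_mad_f32 / v_fma_f32
; operand modifiers when flushing or contracting, and otherwise remain a
; separate v_mul_f32 plus v_sub/v_subrev.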

; GCN-LABEL: {{^}}mad_sub_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_inv_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %c, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c.abs
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %c.abs, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}neg_neg_mad_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %nega = fsub float -0.000000e+00, %a
  %negb = fsub float -0.000000e+00, %b
  %mul = fmul float %nega, %negb
  %sub = fadd float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_fabs_sub_f32:
; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %b.abs = call float @llvm.fabs.f32(float %b) #0
  %mul = fmul float %a, %b.abs
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %r2, %add

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %add, %r2

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }