; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s


; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s

; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s


; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.

target triple = "amdgcn--"


; Intrinsic declarations shared by the test kernels in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare float @llvm.fabs.f32(float) #1

; Plain llvm.fmuladd.f32 on three loaded floats. With denormals flushed the
; checks expect a single v_mac_f32 / v_fmac_f32; with denormals enabled they
; expect v_fma_f32 on fast-FMA targets and a separate multiply and add on
; slow-FMA targets.
; GCN-LABEL: {{^}}fmuladd_f32:
; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                       float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load float, float addrspace(1)* %in1
  %r1 = load float, float addrspace(1)* %in2
  %r2 = load float, float addrspace(1)* %in3
  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %out
  ret void
}

; Separate fmul + fadd (no fmuladd intrinsic). With denormals flushed the
; checks expect v_mac_f32. With denormals enabled, fusion into v_fma_f32 is
; only expected under the CONTRACT prefixes on fast-FMA targets; the STRICT
; and slow-FMA configurations expect the multiply and add to stay separate.
; GCN-LABEL: {{^}}fmul_fadd_f32:
; GCN-FLUSH: v_mac_f32

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32

; GCN-DENORM-STRICT: v_mul_f32_e32
; GCN-DENORM-STRICT: v_add_f32_e32
define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load volatile float, float addrspace(1)* %in1
  %r1 = load volatile float, float addrspace(1)* %in2
  %r2 = load volatile float, float addrspace(1)* %in3
  %mul = fmul float %r0, %r1
  %add = fadd float %mul, %r2
  store float %add, float addrspace(1)* %out
  ret void
}

; fmuladd(2.0, a, b) with the constant as the first multiplicand. With
; denormals enabled on slow-FMA targets the multiply by 2.0 is expected to be
; emitted as a + a followed by the add of b.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  ; Volatile loads keep the two operands as distinct, ordered loads.
  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(a, 2.0, b) with the constant as the second multiplicand. The
; expected instructions are the same as for the 2.0-first variant.
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; (a + a) + b written with plain fadds. With denormals flushed the checks
; expect this to become a multiply-accumulate by 2.0; with denormals enabled
; only the fast-FMA CONTRACT configuration expects v_fma_f32, the others keep
; two adds.
; GCN-LABEL: {{^}}fadd_a_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %add.0, %r1
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; b + (a + a): same as the a_a_b variant but with the operands of the outer
; fadd swapped, which is reflected in the operand order of the second
; v_add_f32 in the non-fused checks.
; GCN-LABEL: {{^}}fadd_b_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %r1, %add.0
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, a, b). The fused forms are expected to use -2.0 as an inline
; constant; the slow-FMA denormal form is expected as b - (a + a).
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, -a, b). The two negations are expected to cancel, so every
; configuration checks for the same instructions as the positive 2.0 * a + b
; case.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  ; fneg spelled as a subtraction from -0.0.
  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, -a, b). The negation of a is expected to be folded into the
; constant (-2.0) in the fused forms, and into a subtract in the slow-FMA
; denormal form.
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  ; fneg spelled as a subtraction from -0.0.
  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, a, -b). The negated addend is expected as a source modifier on
; the three-operand v_mad_f32 / v_fma_f32 forms rather than the accumulating
; v_mac / v_fmac forms; the slow-FMA denormal form is (a + a) - b.
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; SI-FLUSH: buffer_store_dword [[RESULT]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  ; fneg spelled as a subtraction from -0.0.
  %r2.fneg = fsub float -0.000000e+00, %r2

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; a * b - c written as fmul + fsub. With denormals flushed the checks expect
; v_mad_f32 with a negated addend; with denormals enabled only the fast-FMA
; CONTRACT configuration expects v_fma_f32, the others keep mul + sub.
; GCN-LABEL: {{^}}mad_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; c - a * b written as fmul + fsub. The fused forms are expected to negate
; the first multiplicand instead of the addend; the non-fused forms keep
; mul followed by a subtract from c.
; GCN-LABEL: {{^}}mad_sub_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %c, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; a * b - |c|. The fabs on c is expected to be folded as a source modifier,
; either on the fused instruction (-|c|) or on the e64 form of the separate
; subtract.
; GCN-LABEL: {{^}}mad_sub_fabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c.abs
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; |c| - a * b. The fused forms are expected to negate the first multiplicand
; and take |c| as the addend; the non-fused forms subtract the product from
; |c| using the e64 encoding.
;
; FIXME: the GCN-FLUSH-FMA prefix used below is not enabled by any of the
; FileCheck invocations at the top of this file, so that line is never
; checked. Sibling tests use the plain flush prefix for the v_mad_f32 form;
; confirm the intended gfx906 output before renaming or removing it.
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; GCN-FLUSH-FMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %c.abs, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; (-a) * (-b) + c. The two negations are expected to cancel, so the checks
; match plain, un-negated operands in every configuration.
; GCN-LABEL: {{^}}neg_neg_mad_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %nega = fsub float -0.000000e+00, %a
  %negb = fsub float -0.000000e+00, %b
  %mul = fmul float %nega, %negb
  %sub = fadd float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; Test mad_fabs_sub_f32 computes a * |b| - c. The check lines below expect the
; fabs to become an |...| source modifier on the multiply operand and, in the
; fused forms, the subtraction to become a neg modifier on the addend.
;
; Expected selection, per the check lines that follow.
;  - flush-denormal runs use v_mad_f32 with |b| and -c
;  - denormal runs with fast FMA and contraction use v_fma_f32 with |b| and -c
;  - the remaining denormal runs keep v_mul_f32 (with |b|) and v_sub_f32

; GCN-LABEL: {{^}}mad_fabs_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  ; Per-workitem addressing: a, b, c are read from ptr[tid], ptr[tid + 1] and
  ; ptr[tid + 2]; the result is written to out[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  ; Volatile loads keep the three loads separate and in program order, which
  ; the REGA / REGB / REGC captures above rely on.
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %b.abs = call float @llvm.fabs.f32(float %b) #0
  %mul = fmul float %a, %b.abs
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}
542
; Test fsub_c_fadd_a_a_f32 computes r2 - (r1 + r1). The fused check lines
; below express the doubled operand as a multiply by the inline constant -2.0.
;
; Expected selection, per the check lines that follow.
;  - flush-denormal runs use v_mac_f32 with -2.0, accumulating into the
;    register that holds r2, and that same register is stored
;  - denormal runs with fast FMA and contraction use v_fma_f32 with -2.0
;  - the remaining denormal runs keep v_add_f32 (r1 + r1) and v_sub_f32

; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  ; Note that %in is never used here: both inputs are read from %out (at
  ; out[tid] and out[tid + 1]) and the result is written back to out[tid].
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  ; Volatile loads keep the two loads separate and in program order, which
  ; the R1 / R2 captures above rely on.
  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %r2, %add

  store float %r3, float addrspace(1)* %gep.out
  ret void
}
575
; Test fsub_fadd_a_a_c_f32 computes (r1 + r1) - r2. The fused check lines
; below express the doubled operand as a multiply by the inline constant 2.0
; and the subtraction as a neg modifier on r2.
;
; Expected selection, per the check lines that follow.
;  - flush-denormal runs use v_mad_f32 with 2.0 and -r2
;  - denormal runs with fast FMA and contraction use v_fma_f32 with 2.0 and -r2
;  - the remaining denormal runs keep v_add_f32 (r1 + r1) and v_sub_f32

; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  ; Note that %in is never used here: both inputs are read from %out (at
  ; out[tid] and out[tid + 1]) and the result is written back to out[tid].
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  ; Volatile loads keep the two loads separate and in program order, which
  ; the R1 / R2 captures above rely on.
  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %add, %r2

  store float %r3, float addrspace(1)* %gep.out
  ret void
}
606
; Attribute groups referenced by number from the definitions and calls above.
; Group #0 is attached to the kernels and to the intrinsic calls inside them.
attributes #0 = { nounwind }
; Group #1 is presumably used by the intrinsic declarations elsewhere in this
; file - TODO confirm; it is not referenced in this part of the file.
attributes #1 = { nounwind readnone }