; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s

; Make sure we don't form mad with denormals
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s

; Intrinsic declarations for the test kernels.
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0

; Separate fmul feeding one fadd, with the product as the left operand.
; The kernel computes a * b + c from three consecutive floats and stores
; the sum to out[tid]. Standard runs expect v_mac_f32; denormal runs expect
; either v_fma_f32 or an unfused v_mul/v_add pair, and never v_mad.
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid], in[tid + 1], in[tid + 2], out[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  ; Volatile loads; presumably so each operand stays a separate dword load.
  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  ; Pattern under test: fadd (fmul a, b), c.
  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; Same fadd-of-fmul pattern, but the product has two fadd users.
; The kernel stores a * b + c to out[tid] and a * b + d to out[tid + 1].
; Standard runs expect two v_mac_f32; denormal runs expect two v_fma_f32 or
; one shared v_mul followed by two v_add.
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid .. tid + 3] and out[tid .. tid + 1].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  ; %mul is shared by both additions.
  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; Commuted form of the fadd-of-fmul pattern: the product is the right operand.
; The kernel stores c + a * b to out[tid]. Standard runs expect v_mac_f32;
; denormal runs expect v_fma_f32 or an unfused v_mul/v_add pair.
; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid], in[tid + 1], in[tid + 2], out[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  ; Pattern under test: fadd c, (fmul a, b).
  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; fsub with the product as the minuend.
; The kernel stores a * b - c to out[tid]. Standard runs expect v_mad_f32
; with a negated addend; denormal runs expect v_fma_f32 with a negated
; addend or an unfused v_mul/v_sub pair.
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid], in[tid + 1], in[tid + 2], out[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  ; Pattern under test: fsub (fmul a, b), c.
  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; fsub with the product as the minuend, where the product has two fsub users.
; The kernel stores a * b - c to out[tid] and a * b - d to out[tid + 1].
; Standard runs expect two v_mad_f32; denormal runs expect two v_fma_f32 or
; one shared v_mul followed by two v_sub.
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid .. tid + 3] and out[tid .. tid + 1].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  ; %mul is shared by both subtractions.
  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fsub with the product as the subtrahend.
; The kernel stores c - a * b to out[tid]. Standard runs expect v_mad_f32
; with a negated multiplicand; denormal runs expect v_fma_f32 with a negated
; multiplicand or an unfused v_mul/v_sub pair.
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid], in[tid + 1], in[tid + 2], out[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  ; Pattern under test: fsub c, (fmul a, b).
  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; fsub with the product as the subtrahend, where the product has two users.
; The kernel stores c - a * b to out[tid] and d - a * b to out[tid + 1].
; Standard runs expect two v_mad_f32; denormal runs expect two v_fma_f32 or
; one shared v_mul followed by two v_sub.
; The fourth load is captured as D here so the [[D]] uses below bind to this
; function's register instead of a value left over from an earlier test.
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid .. tid + 3] and out[tid .. tid + 1].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  ; %mul is the subtrahend of both subtractions.
  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fsub whose minuend is a negated product.
; The kernel stores -(a * b) - c to out[tid], with the negation written as
; fsub -0.0, %mul. Standard runs expect one v_mad_f32 with negated operands;
; denormal runs expect one v_fma_f32 or a v_mul with a negated source
; followed by v_sub.
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid], in[tid + 1], in[tid + 2], out[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  ; Pattern under test: fsub (fneg (fmul a, b)), c.
  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; Negated-product fsub where the negation itself has two users.
; The kernel stores -(a * b) - c to out[tid] and -(a * b) - d to out[tid + 1].
; Standard runs expect two v_mad_f32; denormal runs expect two v_fma_f32 or
; one v_mul with a negated source followed by two v_sub.
; The fourth load is captured as D here so the [[D]] uses below bind to this
; function's register instead of a value left over from an earlier test.
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid .. tid + 3] and out[tid .. tid + 1].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  ; %mul.neg is shared by both subtractions; %mul has a single user.
  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; Negated-product fsub where the un-negated product has a second user.
; The kernel stores -(a * b) - c to out[tid] and a * b - d to out[tid + 1].
; Standard runs expect two v_mad_f32; denormal runs expect two v_fma_f32 or
; one shared v_mul followed by two v_sub (one with a negated source).
; The fourth load is captured as D here so the [[D]] uses below bind to this
; function's register instead of a value left over from an earlier test.
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid .. tid + 3] and out[tid .. tid + 1].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  ; %mul feeds both the negation and the second subtraction directly.
  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; An llvm.fma.f32 call whose addend is an fmul, followed by an fsub of the
; fma result. The kernel stores fma(x, y, u * v) - z to out[tid].
; Every configuration checks for the unfolded v_mul, v_fma, v_sub sequence.
; The denormal checks capture the subtraction result as RESULT so the final
; store check binds to this function's register in those runs too.
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  ; Element addresses for this workitem: in[tid .. tid + 4] and out[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  ; Pattern under test: fsub (fma x, y, (fmul u, v)), z.
  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
; -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
; Kernel under test: out[tid] = x - fma(y, z, u * v), where x, y, z, u, v are
; the five consecutive floats starting at in[tid] and tid is the workitem id.
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0

  ; Addresses of in[tid + 0 .. tid + 4] and of out[tid].
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  ; NOTE(review): the loads are volatile, presumably so that each one stays a
  ; separate dword load that the checks can bind by its offset -- confirm.
  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  ; The fma result is the subtrahend here: x - (y * z + u * v).
  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}
472
; fold (fsub (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]

; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
; Kernel under test: out[tid] = fmuladd(x, y, u * v) - z, where x, y, z, u, v
; are the five consecutive floats starting at in[tid] and tid is the workitem id.
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0

  ; Addresses of in[tid + 0 .. tid + 4] and of out[tid].
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  ; NOTE(review): the loads are volatile, presumably so that each one stays a
  ; separate dword load that the checks can bind by its offset -- confirm.
  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  ; The fmuladd result is the minuend here: (x * y + u * v) - z.
  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}
522
; fold (fsub x, (fmuladd y, z, (fmul u, v)))
; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]

; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
; Kernel under test: out[tid] = x - fmuladd(y, z, u * v), where x, y, z, u, v
; are the five consecutive floats starting at in[tid] and tid is the workitem id.
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0

  ; Addresses of in[tid + 0 .. tid + 4] and of out[tid].
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  ; NOTE(review): the loads are volatile, presumably so that each one stays a
  ; separate dword load that the checks can bind by its offset -- confirm.
  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  ; The fmuladd result is the subtrahend here: x - (y * z + u * v).
  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}
573
; Attribute group #0 is applied to the intrinsic declarations and their call
; sites; group #1 is applied to the kernel definitions.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }