; blob: 3c70691a583fdbc56ac4cfcd872220ca6d60b908 [file] [log] [blame]
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s

; minnum(maxnum(a, 0.0), 1.0) on a loaded f32 is expected to select to a
; single v_max_f32 of the value with itself, carrying the clamp modifier.
; GCN-LABEL: {{^}}v_clamp_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Same clamp pattern applied to a negated input (fsub -0.0, a): the negation
; is expected to fold into both source modifiers of the clamped v_max_f32.
; GCN-LABEL: {{^}}v_clamp_neg_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fneg.a = fsub float -0.0, %a
  %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Clamp of -|a|: both the fabs and the negation are expected to fold into the
; source modifiers of the clamped v_max_f32.
; GCN-LABEL: {{^}}v_clamp_negabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %fneg.fabs.a = fsub float -0.0, %fabs.a

  %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

54; GCN-LABEL: {{^}}v_clamp_negzero_f32:
Matt Arsenault6b114d22017-08-30 01:20:17 +000055; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenaultc3dc8e62018-08-03 18:27:52 +000056; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
Stanislav Mekhanoshin79da2a72017-03-11 00:29:27 +000057; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
Matt Arsenaultc3dc8e62018-08-03 18:27:52 +000058; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[SIGNBIT]], 1.0
Matt Arsenault2fdf2a12017-02-21 23:35:48 +000059define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
60 %tid = call i32 @llvm.amdgcn.workitem.id.x()
61 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
62 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
63 %a = load float, float addrspace(1)* %gep0
Matt Arsenaultc3dc8e62018-08-03 18:27:52 +000064 %add = fadd nnan float %a, 0.5
65 %max = call float @llvm.maxnum.f32(float %add, float -0.0)
66 %med = call float @llvm.minnum.f32(float %max, float 1.0)
67
68 store float %med, float addrspace(1)* %out.gep
69 ret void
70}
71
; FIXME: Weird inconsistency in how -0.0 is treated. Accepted if clamp
; matched through med3, not if directly. Is this correct?

; -0.0 lower bound applied directly to a loaded value (no nnan flag on the
; input): the checks expect separate v_max_f32 / v_min_f32 instructions, with
; the -0.0 bound appearing as the literal 0x80000000.
; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The intermediate maxnum result has a second use (a volatile store), so the
; checks expect the max and min to stay as two separate instructions.
; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  store volatile float %max, float addrspace(1)* undef
  ret void
}

; f16 version of the basic clamp pattern. With the fiji and gfx900 run lines
; a clamped v_max_f16 is expected; with the default (SI) run line the clamp
; is expected on the f16-to-f32 conversion, followed by a conversion back.
; GCN-LABEL: {{^}}v_clamp_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %max = call half @llvm.maxnum.f16(half %a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; f16 clamp of a negated input. The negation is expected as a source modifier
; on the clamped v_max_f16 (fiji/gfx900) or on the clamped conversion (SI).
; GCN-LABEL: {{^}}v_clamp_neg_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}

; FIXME: Better to fold neg into max
; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fneg.a = fsub half -0.0, %a
  %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; f16 clamp of -|a|. The neg and abs modifiers are expected on the clamped
; v_max_f16 (fiji/gfx900) or on the clamped conversion (SI).
; GCN-LABEL: {{^}}v_clamp_negabs_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}

; FIXME: Better to fold neg/abs into max

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fabs.a = call half @llvm.fabs.f16(half %a)
  %fneg.fabs.a = fsub half -0.0, %fabs.a

  %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; FIXME: Do f64 instructions support clamp?
; f64 version of the basic clamp pattern; the checks expect a clamped
; v_max_f64 on the loaded 64-bit register pair.
; GCN-LABEL: {{^}}v_clamp_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %max = call double @llvm.maxnum.f64(double %a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; f64 clamp of a negated input; the negation is expected as a source modifier
; on both operands of the clamped v_max_f64.
; GCN-LABEL: {{^}}v_clamp_neg_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fneg.a = fsub double -0.0, %a
  %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; f64 clamp of -|a|; neg and abs are expected as source modifiers on the
; clamped v_max_f64.
; GCN-LABEL: {{^}}v_clamp_negabs_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fabs.a = call double @llvm.fabs.f64(double %a)
  %fneg.fabs.a = fsub double -0.0, %fabs.a

  %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; fmed3 with bounds -0.0 and 1.0: the checks only require that a v_med3_f32
; is still emitted.
; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32
define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(0.0, 1.0, a) is expected to become a clamped v_max_f32.
; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(1.0, 0.0, a) is expected to become a clamped v_max_f32.
; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(a, 0.0, 1.0) is expected to become a clamped v_max_f32.
; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(a, 1.0, 0.0) is expected to become a clamped v_max_f32.
; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(0.0, a, 1.0) is expected to become a clamped v_max_f32.
; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(1.0, a, 0.0) is expected to become a clamped v_max_f32.
; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3(0.0, 1.0, 4.0) is expected to fold to the constant 1.0.
; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3(0.0, 1.0, -4.0) is expected to fold to the constant 0.
; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; A constant already inside [0.0, 1.0] (0.5) is expected to be kept as is.
; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The constant with bit pattern 8388607 (0x7fffff) is expected to survive the
; fold unchanged rather than being flushed to zero.
; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; A quiet-NaN constant operand is expected to fold to 0 under attribute
; group #0.
; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; A signaling-NaN constant operand (bit pattern 2139095041 = 0x7f800001) is
; expected to fold to 0 under attribute group #0.
; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; ---------------------------------------------------------------------
; Test non-default behaviors enabling snans and disabling dx10_clamp
; ---------------------------------------------------------------------

; Clamp pattern on an nnan fadd result under attribute group #2 (defined
; outside this excerpt; presumably disables dx10_clamp, per the test name).
; The checks expect the add to stay separate and feed a v_med3_f32.
; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %a.nnan = fadd nnan float %a, 0.5
  %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Clamp pattern on a plain (non-nnan) fadd result under attribute group #3
; (defined outside this excerpt; presumably snans with dx10_clamp enabled, per
; the test name). The clamp is expected to fold into the v_add_f32 itself.
; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 0.5 clamp{{$}}
define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 0.5
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Basic clamp pattern under attribute group #4 (defined outside this excerpt;
; presumably snans enabled and dx10_clamp disabled, per the test name). The
; checks expect separate v_max_f32 and v_min_f32 instructions.
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Same attribute group (#4) as v_clamp_f32_snan_no_dx10clamp, but the clamped
; value comes from an nnan fadd. The checks expect the add followed by a
; v_med3_f32 instead of a max/min pair.
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd nnan float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(0.0, 1.0, a) under attribute group #2 (defined outside this excerpt):
; the checks still expect a clamped v_max_f32 for this operand order.
; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(1.0, 0.0, a) under attribute group #2 (defined outside this excerpt):
; the checks still expect a clamped v_max_f32 for this operand order.
; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(a, 0.0, 1.0) under attribute group #2 (defined outside this excerpt):
; the checks expect the v_med3_f32 to be kept, operands in source order.
; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(a, 1.0, 0.0) under attribute group #2 (defined outside this excerpt):
; the checks expect the v_med3_f32 to be kept, operands in source order.
; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(0.0, a, 1.0) under attribute group #2 (defined outside this excerpt):
; the checks expect the v_med3_f32 to be kept, operands in source order.
; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(1.0, a, 0.0) under attribute group #2 (defined outside this excerpt):
; the checks expect the v_med3_f32 to be kept, operands in source order.
; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Quiet-NaN constant operand under attribute group #2 (defined outside this
; excerpt): the NaN bit pattern 0x7fc00000 is expected to be the result,
; unlike the fold to 0 checked in v_clamp_constant_qnan_f32.
; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Signaling-NaN constant operand (2139095041 = 0x7f800001) under attribute
; group #2 (defined outside this excerpt): the original bit pattern is
; expected to be the result.
; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Packed <2 x half> clamp pattern. Only the gfx900 run line checks the
; arithmetic, expecting a single clamped v_pk_max_f16 with no other use of
; the loaded register before it.
; GCN-LABEL: {{^}}v_clamp_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Packed clamp where one lane of each bound vector is undef: a clamped
; v_pk_max_f16 is still expected with the gfx900 run line.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_elt:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Negative test: the lower bound vector is <2.0, 0.0>, not all zeros, so
; separate v_pk_max_f16 and v_pk_min_f16 instructions are expected.
; GCN-LABEL: {{^}}v_clamp_v2f16_not_zero:
; GFX9: v_pk_max_f16
; GFX9: v_pk_min_f16
define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Negative case: the minnum operand is <0.0, 1.0> instead of all ones. The
; checks expect the GFX9 run to keep a v_pk_max_f16 followed by a separate
; v_pk_min_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_not_one:
; GFX9: v_pk_max_f16
; GFX9: v_pk_min_f16
define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  ; Per-workitem element addresses in the input and output buffers.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %val = load <2 x half>, <2 x half> addrspace(1)* %src.ptr
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %val, <2 x half> <half 0.0, half 0.0>)
  ; Lane 0 of the upper limit is 0.0, not 1.0.
  %bounded = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 0.0, half 1.0>)
  store <2 x half> %bounded, <2 x half> addrspace(1)* %dst.ptr
  ret void
}
599
; Clamp of a negated packed-half value (negation written as fsub from -0.0).
; The checks expect the GFX9 run to emit one v_pk_max_f16 with neg_lo and
; neg_hi set on both operands plus the clamp modifier, with no other mention
; of the loaded register between the load and that instruction.
; GCN-LABEL: {{^}}v_clamp_neg_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  ; Per-workitem element addresses in the input and output buffers.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %val = load <2 x half>, <2 x half> addrspace(1)* %src.ptr
  ; Negate both lanes before clamping.
  %negated = fsub <2 x half> <half -0.0, half -0.0>, %val
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %negated, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.ptr
  ret void
}
616
; Clamp of -|a| on a packed-half value. The checks expect the GFX9 run to
; compute the absolute value with a v_and_b32 against 0x7fff7fff and then emit
; one v_pk_max_f16 on that result with neg_lo/neg_hi and the clamp modifier.
; GCN-LABEL: {{^}}v_clamp_negabs_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  ; Per-workitem element addresses in the input and output buffers.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %val = load <2 x half>, <2 x half> addrspace(1)* %src.ptr
  ; fabs, then negate via fsub from -0.0.
  %magnitude = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %neg.magnitude = fsub <2 x half> <half -0.0, half -0.0>, %magnitude
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.magnitude, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.ptr
  ret void
}
635
; Clamp of a packed-half value whose element 0 alone has been negated. The
; checks expect the GFX9 run to emit one v_pk_max_f16 with only neg_lo set on
; both operands plus the clamp modifier, with no other mention of the loaded
; register between the load and that instruction.
; GCN-LABEL: {{^}}v_clamp_neglo_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  ; Per-workitem element addresses in the input and output buffers.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %val = load <2 x half>, <2 x half> addrspace(1)* %src.ptr
  ; Pull out element 0, negate it, and put it back in place.
  %elt0 = extractelement <2 x half> %val, i32 0
  %elt0.neg = fsub half -0.0, %elt0
  %val.neglo = insertelement <2 x half> %val, half %elt0.neg, i32 0
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %val.neglo, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.ptr
  ret void
}
654
; Clamp of a packed-half value whose element 1 alone has been negated. The
; checks expect the GFX9 run to emit one v_pk_max_f16 with only neg_hi set on
; both operands plus the clamp modifier, with no other mention of the loaded
; register between the load and that instruction.
; GCN-LABEL: {{^}}v_clamp_neghi_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  ; Per-workitem element addresses in the input and output buffers.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %val = load <2 x half>, <2 x half> addrspace(1)* %src.ptr
  ; Pull out element 1, negate it, and put it back in place.
  %elt1 = extractelement <2 x half> %val, i32 1
  %elt1.neg = fsub half -0.0, %elt1
  %val.neghi = insertelement <2 x half> %val, half %elt1.neg, i32 1
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %val.neghi, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.ptr
  ret void
}
673
; Clamp of a packed-half value whose two elements have been swapped by a
; shufflevector with mask <1, 0>. The checks expect the GFX9 run to emit one
; v_pk_max_f16 with op_sel:[1,1] op_sel_hi:[0,0] plus the clamp modifier, with
; no other mention of the loaded register between the load and it.
; GCN-LABEL: {{^}}v_clamp_v2f16_shuffle:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  ; Per-workitem element addresses in the input and output buffers.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %val = load <2 x half>, <2 x half> addrspace(1)* %src.ptr
  ; Swap the two elements before clamping.
  %swapped = shufflevector <2 x half> %val, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %swapped, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.ptr
  ret void
}
690
; Clamp limits with complementary undef lanes: the maxnum operand is
; <0.0, undef> and the minnum operand is <undef, 1.0>. The checks expect the
; GFX9 run to emit one v_pk_max_f16 with the clamp modifier, with no other
; mention of the loaded register between the load and that instruction.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  ; Per-workitem element addresses in the input and output buffers.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %val = load <2 x half>, <2 x half> addrspace(1)* %src.ptr
  ; Lower limit has undef in lane 1; upper limit has undef in lane 0.
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %val, <2 x half> <half 0.0, half undef>)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half undef, half 1.0>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.ptr
  ret void
}
706
; Clamp limits with complementary undef lanes: the maxnum operand is
; <undef, 0.0> and the minnum operand is <1.0, undef>. The checks expect the
; GFX9 run to emit one v_pk_max_f16 with the clamp modifier, with no other
; mention of the loaded register between the load and that instruction.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  ; Per-workitem element addresses in the input and output buffers.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %lane
  %dst.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %lane
  %val = load <2 x half>, <2 x half> addrspace(1)* %src.ptr
  ; Lower limit has undef in lane 0; upper limit has undef in lane 1.
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %val, <2 x half> <half undef, half 0.0>)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half undef>)
  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.ptr
  ret void
}
722
; Two distinct fadd results feed a maxnum, and that result then goes through
; maxnum with 0.0 and minnum with 1.0; every floating-point op carries nsz.
; The checks expect a single v_max_f32_e64 that takes both sums as operands
; and carries the clamp modifier.
; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
; GCN: v_add_f32_e32 [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[B:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[B]] clamp{{$}}
define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  ; Three consecutive floats are read starting at %aptr.
  %p0 = getelementptr float, float addrspace(1)* %aptr, i32 0
  %p1 = getelementptr float, float addrspace(1)* %aptr, i32 1
  %p2 = getelementptr float, float addrspace(1)* %aptr, i32 2
  %x0 = load float, float addrspace(1)* %p0
  %x1 = load float, float addrspace(1)* %p1
  %x2 = load float, float addrspace(1)* %p2
  ; Both sums share the first loaded value.
  %sum01 = fadd nsz float %x0, %x1
  %sum02 = fadd nsz float %x0, %x2
  %larger = call nsz float @llvm.maxnum.f32(float %sum01, float %sum02)
  %lower = call nsz float @llvm.maxnum.f32(float %larger, float 0.0)
  %clamped = call nsz float @llvm.minnum.f32(float %lower, float 1.0)
  ; The result is written to element 3 of %out.
  %dst.ptr = getelementptr float, float addrspace(1)* %out, i32 3
  store float %clamped, float addrspace(1)* %dst.ptr
  ret void
}
744
; Intrinsic declarations, grouped by operation. All use attribute group #1.

; Workitem id.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; fabs
declare half @llvm.fabs.f16(half) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1

; minnum
declare half @llvm.minnum.f16(half, half) #1
declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
declare float @llvm.minnum.f32(float, float) #1
declare double @llvm.minnum.f64(double, double) #1

; maxnum
declare half @llvm.maxnum.f16(half, half) #1
declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
declare float @llvm.maxnum.f32(float, float) #1
declare double @llvm.maxnum.f64(double, double) #1

; fmed3
declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
Matt Arsenault2fdf2a12017-02-21 23:35:48 +0000759
; Attribute groups.
;   #0      - nounwind only.
;   #1      - nounwind readnone.
;   #2..#4  - nounwind, "no-nans-fp-math" explicitly "false", and a
;             per-group combination of the dx10-clamp and fp-exceptions
;             target features.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "no-nans-fp-math"="false" "target-features"="-dx10-clamp,-fp-exceptions" }
attributes #3 = { nounwind "no-nans-fp-math"="false" "target-features"="+dx10-clamp,+fp-exceptions" }
attributes #4 = { nounwind "no-nans-fp-math"="false" "target-features"="-dx10-clamp,+fp-exceptions" }