Blame - llvm/test/CodeGen/AMDGPU/clamp.ll - toolchain/llvm-project

blob: d98b56062cd53119e00f6bbd0fd66cccadc697f7 [file] [log] [blame]

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	4
				5	; GCN-LABEL: {{^}}v_clamp_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	6	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	7	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
				8	define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				9	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				10	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				11	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				12	%a = load float, float addrspace(1)* %gep0
				13	%max = call float @llvm.maxnum.f32(float %a, float 0.0)
				14	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				15
				16	store float %med, float addrspace(1)* %out.gep
				17	ret void
				18	}
				19
				20	; GCN-LABEL: {{^}}v_clamp_neg_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	21	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	22	; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
				23	define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				24	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				25	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				26	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				27	%a = load float, float addrspace(1)* %gep0
				28	%fneg.a = fsub float -0.0, %a
				29	%max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
				30	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				31
				32	store float %med, float addrspace(1)* %out.gep
				33	ret void
				34	}
				35
				36	; GCN-LABEL: {{^}}v_clamp_negabs_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	37	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	38	; GCN: v_max_f32_e64 v{{[0-9]+}}, -\|[[A]]\|, -\|[[A]]\| clamp{{$}}
				39	define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				40	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				41	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				42	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				43	%a = load float, float addrspace(1)* %gep0
				44	%fabs.a = call float @llvm.fabs.f32(float %a)
				45	%fneg.fabs.a = fsub float -0.0, %fabs.a
				46
				47	%max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
				48	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				49
				50	store float %med, float addrspace(1)* %out.gep
				51	ret void
				52	}
				53
				54	; GCN-LABEL: {{^}}v_clamp_negzero_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	55	; GCN-DAG: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	c3dc8e6	2018-08-03 18:27:52 +0000	[diff] [blame]	56	; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
Matt Arsenault	ebf4614	2018-09-18 02:34:54 +0000	[diff] [blame]	57	; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[ADD]]
				58	; GCN: v_min_f32_e32 v{{[0-9]+}}, 1.0, [[MAX]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	59	define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				60	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				61	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				62	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				63	%a = load float, float addrspace(1)* %gep0
Matt Arsenault	c3dc8e6	2018-08-03 18:27:52 +0000	[diff] [blame]	64	%add = fadd nnan float %a, 0.5
				65	%max = call float @llvm.maxnum.f32(float %add, float -0.0)
				66	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				67
				68	store float %med, float addrspace(1)* %out.gep
				69	ret void
				70	}
				71
				72	; FIXME: Weird inconsistency in how -0.0 is treated. Accepted if clamp
				73	; matched through med3, not if directly. Is this correct?
				74
				75	; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
				76	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	687ec75	2018-10-22 16:27:27 +0000	[diff] [blame^]	77	; GCN: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
				78	; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]]
Matt Arsenault	c3dc8e6	2018-08-03 18:27:52 +0000	[diff] [blame]	79	; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
				80	define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				81	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				82	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				83	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				84	%a = load float, float addrspace(1)* %gep0
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	85	%max = call float @llvm.maxnum.f32(float %a, float -0.0)
				86	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				87
				88	store float %med, float addrspace(1)* %out.gep
				89	ret void
				90	}
				91
				92	; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	93	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	687ec75	2018-10-22 16:27:27 +0000	[diff] [blame^]	94	; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
				95	; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
				96	; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[QUIET_A]]
				97	; GCN-NOT: [[MAX]]
				98	; GCN-NOT: [[MED]]
				99
				100	; SI: buffer_store_dword [[MED]]
				101	; SI: buffer_store_dword [[MAX]]
				102
				103	; GFX89: {{flat\|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MED]]
				104	; GFX89: {{flat\|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	105	define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				106	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				107	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				108	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				109	%a = load float, float addrspace(1)* %gep0
				110	%max = call float @llvm.maxnum.f32(float %a, float 0.0)
				111	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				112
				113	store float %med, float addrspace(1)* %out.gep
				114	store volatile float %max, float addrspace(1)* undef
				115	ret void
				116	}
				117
				118	; GCN-LABEL: {{^}}v_clamp_f16:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	119	; GCN: {{buffer\|flat\|global}}_load_ushort [[A:v[0-9]+]]
				120	; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	121
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	122	; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
				123	; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	124	define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
				125	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				126	%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				127	%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
				128	%a = load half, half addrspace(1)* %gep0
				129	%max = call half @llvm.maxnum.f16(half %a, half 0.0)
				130	%med = call half @llvm.minnum.f16(half %max, half 1.0)
				131
				132	store half %med, half addrspace(1)* %out.gep
				133	ret void
				134	}
				135
				136	; GCN-LABEL: {{^}}v_clamp_neg_f16:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	137	; GCN: {{buffer\|flat\|global}}_load_ushort [[A:v[0-9]+]]
				138	; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	139
				140	; FIXME: Better to fold neg into max
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	141	; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
				142	; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	143	define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
				144	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				145	%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				146	%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
				147	%a = load half, half addrspace(1)* %gep0
				148	%fneg.a = fsub half -0.0, %a
				149	%max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
				150	%med = call half @llvm.minnum.f16(half %max, half 1.0)
				151
				152	store half %med, half addrspace(1)* %out.gep
				153	ret void
				154	}
				155
				156	; GCN-LABEL: {{^}}v_clamp_negabs_f16:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	157	; GCN: {{buffer\|flat\|global}}_load_ushort [[A:v[0-9]+]]
				158	; GFX89: v_max_f16_e64 v{{[0-9]+}}, -\|[[A]]\|, -\|[[A]]\| clamp{{$}}
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	159
				160	; FIXME: Better to fold neg/abs into max
				161
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	162	; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -\|[[A]]\| clamp{{$}}
				163	; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	164	define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
				165	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				166	%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				167	%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
				168	%a = load half, half addrspace(1)* %gep0
				169	%fabs.a = call half @llvm.fabs.f16(half %a)
				170	%fneg.fabs.a = fsub half -0.0, %fabs.a
				171
				172	%max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
				173	%med = call half @llvm.minnum.f16(half %max, half 1.0)
				174
				175	store half %med, half addrspace(1)* %out.gep
				176	ret void
				177	}
				178
				179	; FIXME: Do f64 instructions support clamp?
				180	; GCN-LABEL: {{^}}v_clamp_f64:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	181	; GCN: {{buffer\|flat\|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
Matt Arsenault	79a45db	2017-02-22 23:53:37 +0000	[diff] [blame]	182	; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	183	define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
				184	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				185	%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
				186	%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
				187	%a = load double, double addrspace(1)* %gep0
				188	%max = call double @llvm.maxnum.f64(double %a, double 0.0)
				189	%med = call double @llvm.minnum.f64(double %max, double 1.0)
				190
				191	store double %med, double addrspace(1)* %out.gep
				192	ret void
				193	}
				194
				195	; GCN-LABEL: {{^}}v_clamp_neg_f64:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	196	; GCN: {{buffer\|flat\|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
Matt Arsenault	79a45db	2017-02-22 23:53:37 +0000	[diff] [blame]	197	; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	198	define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
				199	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				200	%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
				201	%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
				202	%a = load double, double addrspace(1)* %gep0
				203	%fneg.a = fsub double -0.0, %a
				204	%max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
				205	%med = call double @llvm.minnum.f64(double %max, double 1.0)
				206
				207	store double %med, double addrspace(1)* %out.gep
				208	ret void
				209	}
				210
				211	; GCN-LABEL: {{^}}v_clamp_negabs_f64:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	212	; GCN: {{buffer\|flat\|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
Matt Arsenault	79a45db	2017-02-22 23:53:37 +0000	[diff] [blame]	213	; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -\|[[A]]\|, -\|[[A]]\| clamp{{$}}
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	214	define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
				215	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				216	%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
				217	%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
				218	%a = load double, double addrspace(1)* %gep0
				219	%fabs.a = call double @llvm.fabs.f64(double %a)
				220	%fneg.fabs.a = fsub double -0.0, %fabs.a
				221
				222	%max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
				223	%med = call double @llvm.minnum.f64(double %max, double 1.0)
				224
				225	store double %med, double addrspace(1)* %out.gep
				226	ret void
				227	}
				228
				229	; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	230	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	231	; GCN: v_med3_f32
				232	define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				233	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				234	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				235	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				236	%a = load float, float addrspace(1)* %gep0
				237	%med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
				238	store float %med, float addrspace(1)* %out.gep
				239	ret void
				240	}
				241
				242	; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	243	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	244	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
				245	define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				246	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				247	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				248	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				249	%a = load float, float addrspace(1)* %gep0
				250	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
				251	store float %med, float addrspace(1)* %out.gep
				252	ret void
				253	}
				254
				255	; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	256	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	257	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
				258	define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				259	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				260	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				261	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				262	%a = load float, float addrspace(1)* %gep0
				263	%med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
				264	store float %med, float addrspace(1)* %out.gep
				265	ret void
				266	}
				267
				268	; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	269	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	270	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
				271	define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				272	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				273	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				274	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				275	%a = load float, float addrspace(1)* %gep0
				276	%med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
				277	store float %med, float addrspace(1)* %out.gep
				278	ret void
				279	}
				280
				281	; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	282	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	283	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
				284	define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				285	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				286	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				287	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				288	%a = load float, float addrspace(1)* %gep0
				289	%med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
				290	store float %med, float addrspace(1)* %out.gep
				291	ret void
				292	}
				293
				294	; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	295	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	296	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
				297	define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				298	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				299	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				300	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				301	%a = load float, float addrspace(1)* %gep0
				302	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
				303	store float %med, float addrspace(1)* %out.gep
				304	ret void
				305	}
				306
				307	; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	308	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	309	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
				310	define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				311	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				312	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				313	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				314	%a = load float, float addrspace(1)* %gep0
				315	%med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
				316	store float %med, float addrspace(1)* %out.gep
				317	ret void
				318	}
				319
				320	; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
				321	; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
				322	define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
				323	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				324	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				325	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
				326	store float %med, float addrspace(1)* %out.gep
				327	ret void
				328	}
				329
				330	; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
				331	; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
				332	define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
				333	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				334	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				335	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
				336	store float %med, float addrspace(1)* %out.gep
				337	ret void
				338	}
				339
				340	; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
				341	; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
				342	define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
				343	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				344	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				345	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
				346	store float %med, float addrspace(1)* %out.gep
				347	ret void
				348	}
				349
				350	; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
				351	; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
				352	define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
				353	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				354	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				355	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
				356	store float %med, float addrspace(1)* %out.gep
				357	ret void
				358	}
				359
				360	; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
				361	; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
				362	define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
				363	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				364	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				365	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
				366	store float %med, float addrspace(1)* %out.gep
				367	ret void
				368	}
				369
				370	; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
				371	; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
				372	define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
				373	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				374	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				375	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
				376	store float %med, float addrspace(1)* %out.gep
				377	ret void
				378	}
				379
; ---------------------------------------------------------------------
; Test non-default behaviors enabling snans and disabling dx10_clamp
; ---------------------------------------------------------------------
				383
				384	; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	385	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	c3dc8e6	2018-08-03 18:27:52 +0000	[diff] [blame]	386	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
				387	; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	388	define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
				389	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				390	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				391	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				392	%a = load float, float addrspace(1)* %gep0
Matt Arsenault	c3dc8e6	2018-08-03 18:27:52 +0000	[diff] [blame]	393	%a.nnan = fadd nnan float %a, 0.5
				394	%max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0)
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	395	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				396
				397	store float %med, float addrspace(1)* %out.gep
				398	ret void
				399	}
				400
				401	; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	402	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	c3dc8e6	2018-08-03 18:27:52 +0000	[diff] [blame]	403	; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 0.5 clamp{{$}}
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	404	define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
				405	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				406	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				407	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				408	%a = load float, float addrspace(1)* %gep0
Matt Arsenault	c3dc8e6	2018-08-03 18:27:52 +0000	[diff] [blame]	409	%add = fadd float %a, 0.5
				410	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	411	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				412
				413	store float %med, float addrspace(1)* %out.gep
				414	ret void
				415	}
				416
				417	; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	418	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	687ec75	2018-10-22 16:27:27 +0000	[diff] [blame^]	419	; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
				420	; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	421	define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
				422	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				423	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				424	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				425	%a = load float, float addrspace(1)* %gep0
				426	%max = call float @llvm.maxnum.f32(float %a, float 0.0)
				427	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				428
				429	store float %med, float addrspace(1)* %out.gep
				430	ret void
				431	}
				432
				433	; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	434	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	301162c	2017-11-15 21:51:43 +0000	[diff] [blame]	435	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
				436	; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	437	define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
				438	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				439	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				440	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				441	%a = load float, float addrspace(1)* %gep0
				442	%add = fadd nnan float %a, 1.0
				443	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				444	%med = call float @llvm.minnum.f32(float %max, float 1.0)
				445
				446	store float %med, float addrspace(1)* %out.gep
				447	ret void
				448	}
				449
				450	; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	451	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	452	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
				453	define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
				454	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				455	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				456	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				457	%a = load float, float addrspace(1)* %gep0
				458	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
				459	store float %med, float addrspace(1)* %out.gep
				460	ret void
				461	}
				462
				463	; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	464	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	465	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
				466	define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
				467	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				468	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				469	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				470	%a = load float, float addrspace(1)* %gep0
				471	%med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
				472	store float %med, float addrspace(1)* %out.gep
				473	ret void
				474	}
				475
				476	; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	477	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	478	; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
				479	define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
				480	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				481	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				482	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				483	%a = load float, float addrspace(1)* %gep0
				484	%med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
				485	store float %med, float addrspace(1)* %out.gep
				486	ret void
				487	}
				488
				489	; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	490	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	491	; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
				492	define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
				493	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				494	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				495	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				496	%a = load float, float addrspace(1)* %gep0
				497	%med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
				498	store float %med, float addrspace(1)* %out.gep
				499	ret void
				500	}
				501
				502	; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	503	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	504	; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
				505	define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
				506	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				507	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				508	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				509	%a = load float, float addrspace(1)* %gep0
				510	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
				511	store float %med, float addrspace(1)* %out.gep
				512	ret void
				513	}
				514
				515	; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	516	; GCN: {{buffer\|flat\|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	517	; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
					; Feeds llvm.amdgcn.fmed3.f32 the operands (1.0, %a, 0.0) under attribute
					; group #2, which turns off dx10-clamp and fp-exceptions. The check lines
					; above expect v_med3_f32 with the operands in that same order.
				518	define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
				519	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				520	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				521	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				522	%a = load float, float addrspace(1)* %gep0
				523	%med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
				524	store float %med, float addrspace(1)* %out.gep
				525	ret void
				526	}
				527
				528	; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
				529	; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
					; All three fmed3 operands are constants: 0.0, 1.0 and a quiet NaN
					; (written 0x7FF8000000000000). Built with attribute group #2 (dx10-clamp
					; and fp-exceptions off). The check line above expects the stored value to
					; be materialized by a v_mov_b32 of the immediate 0x7fc00000.
				530	define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
				531	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; No input pointer here; %tid only picks the output slot.
				532	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				533	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
				534	store float %med, float addrspace(1)* %out.gep
				535	ret void
				536	}
				537
				538	; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
				539	; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
					; All three fmed3 operands are constants: 0.0, 1.0 and the float whose bit
					; pattern is 2139095041 (0x7f800001). Built with attribute group #2
					; (dx10-clamp and fp-exceptions off). The check line above expects that
					; same bit pattern, 0x7f800001, to be moved into a VGPR.
				540	define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
				541	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; No input pointer here; %tid only picks the output slot.
				542	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				543	%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
				544	store float %med, float addrspace(1)* %out.gep
				545	ret void
				546	}
				547
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	548	; GCN-LABEL: {{^}}v_clamp_v2f16:
				549	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				550	; GFX9-NOT: [[A]]
				551	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
					; Packed-half clamp: maxnum against zero in both lanes, then minnum
					; against 1.0 in both lanes. The gfx9 check lines above expect a single
					; v_pk_max_f16 of the loaded register with itself carrying the clamp
					; modifier, with no other use of that register in between.
				552	define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				553	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				554	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				555	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				556	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				557	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
				558	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				559
				560	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				561	ret void
				562	}
				563
				564	; GCN-LABEL: {{^}}v_clamp_v2f16_undef_elt:
				565	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				566	; GFX9-NOT: [[A]]
				567	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
					; Same max-then-min shape as the plain packed clamp, but each bound vector
					; has one undef lane: the lower bound is <undef, 0.0> and the upper bound
					; is <1.0, undef>. The gfx9 check lines above still expect a single
					; v_pk_max_f16 with the clamp modifier.
				568	define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				569	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				570	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				571	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				572	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				573	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
				574	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
				575
				576	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				577	ret void
				578	}
				579
				580	; GCN-LABEL: {{^}}v_clamp_v2f16_not_zero:
				581	; GFX9: v_pk_max_f16
				582	; GFX9: v_pk_min_f16
					; Negative case: the lower bound is <2.0, 0.0>, so one lane is not zero.
					; The gfx9 check lines above expect a v_pk_max_f16 followed by a
					; v_pk_min_f16, i.e. two separate instructions.
				583	define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				584	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				585	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				586	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				587	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				588	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
				589	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				590
				591	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				592	ret void
				593	}
				594
				595	; GCN-LABEL: {{^}}v_clamp_v2f16_not_one:
				596	; GFX9: v_pk_max_f16
				597	; GFX9: v_pk_min_f16
					; Negative case: the upper bound is <0.0, 1.0>, so one lane is not 1.0.
					; The gfx9 check lines above expect a v_pk_max_f16 followed by a
					; v_pk_min_f16, i.e. two separate instructions.
				598	define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				599	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				600	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				601	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				602	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				603	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>)
				604	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>)
				605
				606	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				607	ret void
				608	}
				609
				610	; GCN-LABEL: {{^}}v_clamp_neg_v2f16:
				611	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				612	; GFX9-NOT: [[A]]
				613	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
					; Negates both lanes of the loaded vector (fsub from -0.0) before the
					; max-with-zero / min-with-one pair. The gfx9 check lines above expect the
					; negation to appear as neg_lo:[1,1] neg_hi:[1,1] on a single clamped
					; v_pk_max_f16 whose sources are the loaded register itself.
				614	define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				615	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				616	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				617	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				618	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				619	%fneg.a = fsub <2 x half> <half -0.0, half -0.0>, %a
				620	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer)
				621	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				622
				623	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				624	ret void
				625	}
				626
				627	; GCN-LABEL: {{^}}v_clamp_negabs_v2f16:
				628	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				629	; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, [[A]]
				630	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
					; Applies fabs and then negation (fsub from -0.0) to both lanes before the
					; max-with-zero / min-with-one pair. The gfx9 check lines above expect the
					; fabs as a separate v_and_b32 with mask 0x7fff7fff, and the negation as
					; neg_lo:[1,1] neg_hi:[1,1] on the clamped v_pk_max_f16 that reads the
					; masked value.
				631	define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				632	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				633	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				634	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				635	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				636	%fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
				637	%fneg.fabs.a = fsub <2 x half> <half -0.0, half -0.0>, %fabs.a
				638
				639	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer)
				640	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				641
				642	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				643	ret void
				644	}
				645
				646	; GCN-LABEL: {{^}}v_clamp_neglo_v2f16:
				647	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				648	; GFX9-NOT: [[A]]
				649	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] clamp{{$}}
					; Negates only element 0 of the loaded vector (extract, fsub from -0.0,
					; insert back at index 0) before the max-with-zero / min-with-one pair.
					; The gfx9 check lines above expect neg_lo:[1,1] alone on a single clamped
					; v_pk_max_f16 whose sources are the loaded register.
				650	define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				651	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				652	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				653	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				654	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				655	%lo = extractelement <2 x half> %a, i32 0
				656	%neg.lo = fsub half -0.0, %lo
				657	%neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0
				658	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer)
				659	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				660
				661	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				662	ret void
				663	}
				664
				665	; GCN-LABEL: {{^}}v_clamp_neghi_v2f16:
				666	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				667	; GFX9-NOT: [[A]]
				668	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_hi:[1,1] clamp{{$}}
					; Negates only element 1 of the loaded vector (extract, fsub from -0.0,
					; insert back at index 1) before the max-with-zero / min-with-one pair.
					; The gfx9 check lines above expect neg_hi:[1,1] alone on a single clamped
					; v_pk_max_f16 whose sources are the loaded register.
				669	define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				670	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				671	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				672	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				673	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				674	%hi = extractelement <2 x half> %a, i32 1
				675	%neg.hi = fsub half -0.0, %hi
				676	%neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1
				677	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer)
				678	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				679
				680	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				681	ret void
				682	}
				683
				684	; GCN-LABEL: {{^}}v_clamp_v2f16_shuffle:
				685	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				686	; GFX9-NOT: [[A]]
				687	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
					; Swaps the two lanes of the loaded vector with a shufflevector mask of
					; <1, 0> before the max-with-zero / min-with-one pair. The gfx9 check
					; lines above expect the swap to appear as op_sel:[1,1] op_sel_hi:[0,0]
					; on a single clamped v_pk_max_f16 whose sources are the loaded register.
				688	define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				689	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				690	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				691	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				692	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				693	%shuf = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
				694	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
				695	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				696
				697	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				698	ret void
				699	}
				700
Matt Arsenault	b5acec1	2018-08-12 08:42:54 +0000	[diff] [blame]	701	; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
				702	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				703	; GFX9-NOT: [[A]]
				704	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
					; Max-then-min where each bound vector has one undef lane: the lower bound
					; is <0.0, undef> and the upper bound is <undef, 1.0>. The gfx9 check
					; lines above expect a single v_pk_max_f16 with the clamp modifier.
				705	define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				706	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				707	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				708	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				709	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				710	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
				711	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
				712
				713	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				714	ret void
				715	}
				716
				717	; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
				718	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				719	; GFX9-NOT: [[A]]
				720	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
					; Max-then-min where each bound vector has one undef lane: the lower bound
					; is <undef, 0.0> and the upper bound is <1.0, undef>. The gfx9 check
					; lines above expect a single v_pk_max_f16 with the clamp modifier.
					; NOTE(review): the bound constants here are the same as those in
					; @v_clamp_v2f16_undef_elt; confirm whether both tests are still wanted.
				721	define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				722	%tid = call i32 @llvm.amdgcn.workitem.id.x()
					; %tid selects both the element read from %aptr and the slot written in %out.
				723	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				724	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				725	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				726	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
				727	%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
				728
				729	store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
				730	ret void
				731	}
				732
Matt Arsenault	aafff87	2017-10-05 00:13:17 +0000	[diff] [blame]	733	; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
				734	; GCN: v_add_f32_e32 [[A:v[0-9]+]]
				735	; GCN: v_add_f32_e32 [[B:v[0-9]+]]
				736	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[B]] clamp{{$}}
					; Clamp applied to the maxnum of two different values. Two nsz fadds
					; produce %a and %b, their maxnum is taken, and the result goes through
					; the max-with-zero / min-with-one pair (all calls flagged nsz). The check
					; lines above expect two v_add_f32 results to be the two sources of one
					; v_max_f32_e64 that carries the clamp modifier.
				737	define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0
				738	{
					; Unlike the other tests, the addresses use constant indices (0, 1, 2 for
					; the inputs, 3 for the output) rather than the workitem id.
				739	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 0
				740	%gep1 = getelementptr float, float addrspace(1)* %aptr, i32 1
				741	%gep2 = getelementptr float, float addrspace(1)* %aptr, i32 2
				742	%l0 = load float, float addrspace(1)* %gep0
				743	%l1 = load float, float addrspace(1)* %gep1
				744	%l2 = load float, float addrspace(1)* %gep2
					; %l0 is shared by both sums, so %a and %b are distinct values.
				745	%a = fadd nsz float %l0, %l1
				746	%b = fadd nsz float %l0, %l2
				747	%res = call nsz float @llvm.maxnum.f32(float %a, float %b)
				748	%max = call nsz float @llvm.maxnum.f32(float %res, float 0.0)
				749	%min = call nsz float @llvm.minnum.f32(float %max, float 1.0)
				750	%out.gep = getelementptr float, float addrspace(1)* %out, i32 3
				751	store float %min, float addrspace(1)* %out.gep
				752	ret void
				753	}
				754
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	755	declare i32 @llvm.amdgcn.workitem.id.x() #1
					; Intrinsic declarations, all tagged with attribute group #1. The workitem
					; id query is declared above; the rest are grouped by element type below.
					; f32 intrinsics: fabs, minnum, maxnum and the AMDGPU three-operand fmed3.
				756	declare float @llvm.fabs.f32(float) #1
				757	declare float @llvm.minnum.f32(float, float) #1
				758	declare float @llvm.maxnum.f32(float, float) #1
				759	declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
					; f64 intrinsics (not called in the tests near the end of the file;
					; presumably used by earlier tests - verify before removing).
				760	declare double @llvm.fabs.f64(double) #1
				761	declare double @llvm.minnum.f64(double, double) #1
				762	declare double @llvm.maxnum.f64(double, double) #1
					; Scalar f16 intrinsics (same caveat as the f64 ones).
				763	declare half @llvm.fabs.f16(half) #1
				764	declare half @llvm.minnum.f16(half, half) #1
				765	declare half @llvm.maxnum.f16(half, half) #1
					; Packed <2 x half> intrinsics used by the *_v2f16 tests.
Matt Arsenault	6b114d2	2017-08-30 01:20:17 +0000	[diff] [blame]	766	declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
				767	declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
				768	declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
Matt Arsenault	2fdf2a1	2017-02-21 23:35:48 +0000	[diff] [blame]	769
					; Attribute groups referenced by the definitions and declarations above.
					; #0: plain nounwind, used by the ordinary clamp kernels.
				770	attributes #0 = { nounwind }
					; #1: nounwind readnone, applied to the intrinsic declarations.
				771	attributes #1 = { nounwind readnone }
					; #2: dx10-clamp and fp-exceptions both disabled; used by the
					; *_no_dx10_clamp kernels.
				772	attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
					; #3: dx10-clamp and fp-exceptions both enabled.
				773	attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
					; #4: dx10-clamp disabled, fp-exceptions enabled.
				774	attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }