Blame - llvm/test/CodeGen/AMDGPU/clamp-modifier.ll - toolchain/llvm-project

blob: f405984c574ec2e6ce580e4354bfa53512fb0e9e [file] [log] [blame]

Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	1	; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
				2	; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
				3	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
					; The fiji run also enables the VI prefix so that the VI-only checks on
					; v_clamp_cvt_pkrtz_src_v2f16_denorm are actually exercised.
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	4
				5	; GCN-LABEL: {{^}}v_clamp_add_src_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	6	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	7	; GCN-NOT: [[A]]
				8	; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
				9	define amdgpu_kernel void @v_clamp_add_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { ; baseline case, a single-use fadd feeding a [0.0, 1.0] clamp
				10	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				11	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid ; input element indexed by workitem id
				12	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid ; output element indexed by workitem id
				13	%a = load float, float addrspace(1)* %gep0
				14	%add = fadd float %a, 1.0 ; clamp source; its only user is the maxnum below
				15	%max = call float @llvm.maxnum.f32(float %add, float 0.0) ; lower bound 0.0
				16	%clamp = call float @llvm.minnum.f32(float %max, float 1.0) ; upper bound 1.0; the checks above expect max+min folded into the add as a clamp modifier
				17	store float %clamp, float addrspace(1)* %out.gep
				18	ret void
				19	}
				20
				21	; GCN-LABEL: {{^}}v_clamp_multi_use_src_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	22	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	23	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				24	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[ADD]], [[ADD]] clamp{{$}}
				25	define amdgpu_kernel void @v_clamp_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { ; clamp source has a second, non-clamp use
				26	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				27	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				28	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				29	%a = load float, float addrspace(1)* %gep0
				30	%add = fadd float %a, 1.0 ; used by both the clamp and the volatile store below
				31	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				32	%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				33	store float %clamp, float addrspace(1)* %out.gep
				34	store volatile float %add, float addrspace(1)* undef ; extra use of the unclamped value; the checks above expect the clamp on a separate v_max_f32
				35	ret void
				36	}
				37
				38	; GCN-LABEL: {{^}}v_clamp_dbg_use_src_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	39	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	40	; GCN-NOT: [[A]]
				41	; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
				42	define amdgpu_kernel void @v_clamp_dbg_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { ; same as the single-use case plus a debug-intrinsic use of the add
				43	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				44	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				45	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				46	%a = load float, float addrspace(1)* %gep0
				47	%add = fadd float %a, 1.0
				48	call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10 ; debug-only use of %add; the checks above still expect the clamp folded into the add
				49	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				50	%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				51	store float %clamp, float addrspace(1)* %out.gep
				52	ret void
				53	}
				54
				55	; GCN-LABEL: {{^}}v_clamp_add_neg_src_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	56	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	57	; GCN: v_floor_f32_e32 [[FLOOR:v[0-9]+]], [[A]]
				58	; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[FLOOR]], -[[FLOOR]] clamp{{$}}
				59	define amdgpu_kernel void @v_clamp_add_neg_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { ; clamp of a negated value (despite the name, the source op here is floor, not add)
				60	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				61	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				62	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				63	%a = load float, float addrspace(1)* %gep0
				64	%floor = call float @llvm.floor.f32(float %a)
				65	%neg.floor = fsub float -0.0, %floor ; negation sits between the source op and the clamp
				66	%max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
				67	%clamp = call float @llvm.minnum.f32(float %max, float 1.0) ; the checks above expect v_max_f32 with negated operands and clamp, not a clamp on the floor
				68	store float %clamp, float addrspace(1)* %out.gep
				69	ret void
				70	}
				71
				72	; GCN-LABEL: {{^}}v_non_clamp_max_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	73	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	74	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				75	; GCN: v_max_f32_e32 v{{[0-9]+}}, 0, [[ADD]]{{$}}
				76	define amdgpu_kernel void @v_non_clamp_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { ; negative test, only the lower bound is applied
				77	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				78	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				79	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				80	%a = load float, float addrspace(1)* %gep0
				81	%add = fadd float %a, 1.0
				82	%max = call float @llvm.maxnum.f32(float %add, float 0.0) ; no minnum with 1.0 follows, so the checks above expect a plain v_max_f32 without clamp
				83	store float %max, float addrspace(1)* %out.gep
				84	ret void
				85	}
				86
				87	; GCN-LABEL: {{^}}v_clamp_add_src_f32_denormals:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	88	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	89	; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
				90	define amdgpu_kernel void @v_clamp_add_src_f32_denormals(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { ; single-use case under attribute group #2 (+fp32-denormals)
				91	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				92	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				93	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				94	%a = load float, float addrspace(1)* %gep0
				95	%add = fadd float %a, 1.0
				96	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				97	%clamp = call float @llvm.minnum.f32(float %max, float 1.0) ; the check above still expects the clamp folded into v_add_f32
				98	store float %clamp, float addrspace(1)* %out.gep
				99	ret void
				100	}
				101
				102	; GCN-LABEL: {{^}}v_clamp_add_src_f16_denorm:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	103	; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
				104	; GFX89: v_add_f16_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	105
				106	; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
				107	; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
				108	; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
				109	define amdgpu_kernel void @v_clamp_add_src_f16_denorm(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { ; half-precision single-use case under attribute group #0
				110	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				111	%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				112	%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
				113	%a = load half, half addrspace(1)* %gep0
				114	%add = fadd half %a, 1.0
				115	%max = call half @llvm.maxnum.f16(half %add, half 0.0)
				116	%clamp = call half @llvm.minnum.f16(half %max, half 1.0) ; GFX89 checks expect v_add_f16 with clamp; SI checks expect the clamp on an f32 add between two conversions
				117	store half %clamp, half addrspace(1)* %out.gep
				118	ret void
				119	}
				120
				121	; GCN-LABEL: {{^}}v_clamp_add_src_f16_no_denormals:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	122	; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
				123	; GFX89-NOT: [[A]]
				124	; GFX89: v_add_f16_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	125
				126	; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
				127	; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
				128	; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
				129	define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(half addrspace(1)* %out, half addrspace(1)* %aptr) #3 { ; half-precision single-use case under attribute group #3 (-fp64-fp16-denormals)
				130	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				131	%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				132	%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
				133	%a = load half, half addrspace(1)* %gep0
				134	%add = fadd half %a, 1.0
				135	%max = call half @llvm.maxnum.f16(half %add, half 0.0)
				136	%clamp = call half @llvm.minnum.f16(half %max, half 1.0) ; expected output matches the denorm variant, a clamped add on each target
				137	store half %clamp, half addrspace(1)* %out.gep
				138	ret void
				139	}
				140
				141	; GCN-LABEL: {{^}}v_clamp_add_src_v2f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	142	; GCN: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	143	; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[A]], 1.0 clamp{{$}}
				144	; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[B]], 1.0 clamp{{$}}
				145	define amdgpu_kernel void @v_clamp_add_src_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %aptr) #0 { ; two-element float vector version of the single-use case
				146	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				147	%gep0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %aptr, i32 %tid
				148	%out.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid
				149	%a = load <2 x float>, <2 x float> addrspace(1)* %gep0
				150	%add = fadd <2 x float> %a, <float 1.0, float 1.0>
				151	%max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %add, <2 x float> zeroinitializer) ; lower bound 0.0 in both lanes
				152	%clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>) ; the checks above expect one clamped v_add_f32 per element, in either order
				153	store <2 x float> %clamp, <2 x float> addrspace(1)* %out.gep
				154	ret void
				155	}
				156
Matt Arsenault	79a45db	2017-02-22 23:53:37 +0000	[diff] [blame]	157	; GCN-LABEL: {{^}}v_clamp_add_src_f64:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	158	; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
Matt Arsenault	79a45db	2017-02-22 23:53:37 +0000	[diff] [blame]	159	; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], 1.0 clamp{{$}}
				160	define amdgpu_kernel void @v_clamp_add_src_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 { ; double-precision version of the single-use case
				161	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				162	%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
				163	%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
				164	%a = load double, double addrspace(1)* %gep0
				165	%add = fadd double %a, 1.0
				166	%max = call double @llvm.maxnum.f64(double %add, double 0.0)
				167	%clamp = call double @llvm.minnum.f64(double %max, double 1.0) ; the check above expects v_add_f64 with clamp
				168	store double %clamp, double addrspace(1)* %out.gep
				169	ret void
				170	}
				171
Matt Arsenault	3cb9ff8	2017-03-11 05:40:40 +0000	[diff] [blame]	172	; GCN-LABEL: {{^}}v_clamp_mac_to_mad:
				173	; GCN: v_mad_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
				174	define amdgpu_kernel void @v_clamp_mac_to_mad(float addrspace(1)* %out, float addrspace(1)* %aptr, float %a) #0 { ; clamp applied to a multiply-add whose addend %b is reused afterwards
				175	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				176	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				177	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				178	%b = load float, float addrspace(1)* %gep0
				179
				180	%mul = fmul float %a, %a ; both multiplicands are the kernel argument %a, matching the two s-register operands in the check
				181	%add = fadd float %mul, %b ; multiply-add that feeds the clamp
				182	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				183	%clamp = call float @llvm.minnum.f32(float %max, float 1.0) ; the check above expects a three-operand v_mad_f32 carrying the clamp
				184	%res = fadd float %clamp, %b ; second use of %b after the clamp
				185	store float %res, float addrspace(1)* %out.gep
				186	ret void
				187	}
				188
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	189
				190	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm:
Stanislav Mekhanoshin	8b20b7d	2018-04-17 23:09:05 +0000	[diff] [blame]	191	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				192	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0] clamp{{$}}
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	193	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { ; packed-half single-use case under attribute group #0
				194	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				195	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				196	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				197	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				198	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				199	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
				200	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; the GFX9 check above expects v_pk_add_f16 with clamp
				201	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				202	ret void
				203	}
				204
				205	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_no_denormals:
Stanislav Mekhanoshin	8b20b7d	2018-04-17 23:09:05 +0000	[diff] [blame]	206	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				207	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0] clamp{{$}}
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	208	define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #3 { ; packed-half single-use case under attribute group #3 (-fp64-fp16-denormals)
				209	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				210	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				211	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				212	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				213	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				214	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
				215	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; the GFX9 check above expects the same clamped v_pk_add_f16 as the denorm variant
				216	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				217	ret void
				218	}
				219
				220	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_neg:
Stanislav Mekhanoshin	8b20b7d	2018-04-17 23:09:05 +0000	[diff] [blame]	221	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				222	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0]{{$}}
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	223	; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
				224	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { ; packed-half clamp of a fully negated add
				225	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				226	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				227	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				228	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				229	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				230	%neg.add = fsub <2 x half> <half -0.0, half -0.0>, %add ; negates both lanes between the add and the clamp
				231	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.add, <2 x half> zeroinitializer)
				232	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; the GFX9 checks above expect an unclamped add, then v_pk_max_f16 with neg_lo, neg_hi and clamp
				233	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				234	ret void
				235	}
				236
				237	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_neg_lo:
Stanislav Mekhanoshin	8b20b7d	2018-04-17 23:09:05 +0000	[diff] [blame]	238	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				239	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0]{{$}}
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	240	; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] neg_lo:[1,1] clamp{{$}}
				241	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { ; packed-half clamp where only element 0 of the add is negated
				242	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				243	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				244	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				245	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				246	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				247	%lo = extractelement <2 x half> %add, i32 0
				248	%neg.lo = fsub half -0.0, %lo ; negate element 0 only
				249	%neg.lo.add = insertelement <2 x half> %add, half %neg.lo, i32 0 ; element 1 is left as produced by the add
				250	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.add, <2 x half> zeroinitializer)
				251	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; the GFX9 checks above expect v_pk_max_f16 with neg_lo and clamp
				252	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				253	ret void
				254	}
				255
				256	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_neg_hi:
Stanislav Mekhanoshin	8b20b7d	2018-04-17 23:09:05 +0000	[diff] [blame]	257	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				258	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0]{{$}}
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	259	; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] neg_hi:[1,1] clamp{{$}}
				260	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { ; packed-half clamp where only element 1 of the add is negated
				261	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				262	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				263	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				264	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				265	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				266	%hi = extractelement <2 x half> %add, i32 1
				267	%neg.hi = fsub half -0.0, %hi ; negate element 1 only
				268	%neg.hi.add = insertelement <2 x half> %add, half %neg.hi, i32 1 ; element 0 is left as produced by the add
				269	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.add, <2 x half> zeroinitializer)
				270	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; the GFX9 checks above expect v_pk_max_f16 with neg_hi and clamp
				271	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				272	ret void
				273	}
				274
				275	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_shuf:
Stanislav Mekhanoshin	8b20b7d	2018-04-17 23:09:05 +0000	[diff] [blame]	276	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				277	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0]{{$}}
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	278	; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
				279	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { ; packed-half clamp of an add whose two elements are swapped first
				280	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				281	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				282	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				283	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				284	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				285	%shuf = shufflevector <2 x half> %add, <2 x half> undef, <2 x i32> <i32 1, i32 0> ; mask <1, 0> swaps the two elements
				286
				287	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
				288	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; the GFX9 checks above expect v_pk_max_f16 with op_sel modifiers and clamp
				289	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				290	ret void
				291	}
				292
				293	; GCN-LABEL: {{^}}v_no_clamp_add_src_v2f16_f32_src:
				294	; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				295	; GFX9: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				296	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ADD]], [[ADD]] clamp{{$}}
				297	define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { ; negative test, the source op is f32 but the clamp is on v2f16
				298	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				299	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				300	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				301	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				302	%bc = bitcast <2 x half> %a to float ; reinterpret the 32 loaded bits as a single float
				303	%f32.op = fadd float %bc, 1.0 ; add performed as f32
				304	%f32.op.cast = bitcast float %f32.op to <2 x half> ; reinterpret the f32 result as two halves
				305	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %f32.op.cast, <2 x half> zeroinitializer)
				306	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; the GFX9 checks above expect an unclamped v_add_f32 and a separate clamped v_pk_max_f16
				307	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				308	ret void
				309	}
				310
				311	; GCN-LABEL: {{^}}v_no_clamp_add_packed_src_f32:
Stanislav Mekhanoshin	8b20b7d	2018-04-17 23:09:05 +0000	[diff] [blame]	312	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				313	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0]{{$}}
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	314	; GFX9: v_max_f32_e64 [[CLAMP:v[0-9]+]], [[ADD]], [[ADD]] clamp{{$}}
				315	define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { ; negative test, the source op is packed half but the clamp is on f32
				316	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				317	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				318	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				319	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				320	%add = fadd <2 x half> %a, <half 1.0, half 1.0> ; add performed as v2f16
				321	%bc.add = bitcast <2 x half> %add to float ; reinterpret the packed result as a single float
				322	%max = call float @llvm.maxnum.f32(float %bc.add, float 0.0)
				323	%clamp = call float @llvm.minnum.f32(float %max, float 1.0) ; the GFX9 checks above expect an unclamped v_pk_add_f16 and a separate clamped v_max_f32
				324	store float %clamp, float addrspace(1)* %out.gep
				325	ret void
				326	}
				327
				328	; Since the high bits are zeroed, it probably would be OK in this case
				329	; to use clamp.
				330	; GCN-LABEL: {{^}}v_no_clamp_add_src_v2f16_f16_src:
				331	; GCN-DAG: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
				332	; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				333	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ADD]], [[ADD]] clamp{{$}}
				334	define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(<2 x half> addrspace(1)* %out, half addrspace(1)* %aptr) #0 { ; negative test, the source op is scalar half but the clamp is on v2f16
				335	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				336	%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				337	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				338	%a = load half, half addrspace(1)* %gep0
				339	%add = fadd half %a, 1.0 ; add performed as scalar f16
				340	%bc = bitcast half %add to i16
				341	%zext = zext i16 %bc to i32 ; zero-extend, so the upper 16 bits are 0
				342	%v2f16 = bitcast i32 %zext to <2 x half> ; reinterpret the widened 32 bits as two halves
				343	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %v2f16, <2 x half> zeroinitializer)
				344	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; the GFX9 checks above expect an unclamped v_add_f16 and a separate clamped v_pk_max_f16
				345	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				346	ret void
				347	}
				348
Matt Arsenault	709374d	2018-08-01 20:13:58 +0000	[diff] [blame^]	349	; FIXME: Worse code pre-gfx9
				350
				351	; GCN-LABEL: {{^}}v_clamp_cvt_pkrtz_src_v2f16_denorm:
				352	; GFX9: s_waitcnt
				353	; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 clamp{{$}}
				354	; GFX9-NEXT: s_setpc_b64
				355
				356	; VI: v_cvt_pkrtz_f16_f32 v0, v0, v1{{$}}
				357	; VI: v_max_f16_sdwa
				358	; VI: v_max_f16_e64
				359	; VI: v_or_b32
				360
				361	; SI: v_cvt_pkrtz_f16_f32_e32 v0, v0, v1{{$}}
				362	; SI-DAG: v_cvt_f32_f16_e64 v0, v0 clamp
				363	; SI-DAG: v_cvt_f32_f16_e64 v1, v1 clamp
				364	define <2 x half> @v_clamp_cvt_pkrtz_src_v2f16_denorm(float %a, float %b) #0 { ; non-kernel function; the clamp source is the cvt.pkrtz intrinsic applied to the two arguments
				365	%add = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) ; named %add, but this is the packed conversion, not an fadd
				366	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
				367	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; the GFX9 checks above expect the clamp folded into v_cvt_pkrtz_f16_f32
				368	ret <2 x half> %clamp
				369	}
				370
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	371	declare i32 @llvm.amdgcn.workitem.id.x() #1 ; supplies the per-workitem index used by every kernel above
				372	declare float @llvm.fabs.f32(float) #1 ; not called by any function visible in this file
				373	declare float @llvm.floor.f32(float) #1
				374	declare float @llvm.minnum.f32(float, float) #1
				375	declare float @llvm.maxnum.f32(float, float) #1
				376	declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 ; not called by any function visible in this file
				377	declare double @llvm.fabs.f64(double) #1 ; not called by any function visible in this file
				378	declare double @llvm.minnum.f64(double, double) #1
				379	declare double @llvm.maxnum.f64(double, double) #1
				380	declare half @llvm.fabs.f16(half) #1 ; not called by any function visible in this file
				381	declare half @llvm.minnum.f16(half, half) #1
				382	declare half @llvm.maxnum.f16(half, half) #1
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	383	declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
				384	declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	385	declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
				386	declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
Matt Arsenault	709374d	2018-08-01 20:13:58 +0000	[diff] [blame^]	387	declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	388
				389
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	390	declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 ; called only from v_clamp_dbg_use_src_f32
				391
				392	attributes #0 = { nounwind } ; default group used by most test functions
				393	attributes #1 = { nounwind readnone } ; group attached to the intrinsic declarations above
				394	attributes #2 = { nounwind "target-features"="+fp32-denormals" } ; used by v_clamp_add_src_f32_denormals
				395	attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" } ; used by the *_no_denormals f16 and v2f16 tests
				396
				397	!llvm.dbg.cu = !{!0} ; debug metadata below exists to support the llvm.dbg.value call in v_clamp_dbg_use_src_f32
				398	!llvm.module.flags = !{!2, !3}
				399
				400	!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
				401	!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
				402	!2 = !{i32 2, !"Dwarf Version", i32 4}
				403	!3 = !{i32 2, !"Debug Info Version", i32 3}
				404	!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1) ; variable operand of the dbg.value call
				405	!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
				406	!6 = !DISubroutineType(types: !7)
				407	!7 = !{null, !8}
				408	!8 = !DIBasicType(name: "float", size: 32, align: 32)
				409	!9 = !DIExpression() ; empty expression operand of the dbg.value call
				410	!10 = !DILocation(line: 1, column: 42, scope: !5) ; !dbg location attached to the dbg.value call