Blame - llvm/test/CodeGen/AMDGPU/clamp-modifier.ll - toolchain/llvm-project

blob: 9c137101c56d1debf484907c835b694785ab4b94 [file] [log] [blame]

Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	1	; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
				2	; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s
				3	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	4
				5	; GCN-LABEL: {{^}}v_clamp_add_src_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	6	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	7	; GCN-NOT: [[A]]
				8	; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
					; Baseline case: the fadd result is used only by the maxnum(x, 0.0) /
					; minnum(x, 1.0) pair, and the checks expect the clamp modifier on the
					; add itself with no other use of the loaded register in between.
				9	define amdgpu_kernel void @v_clamp_add_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				10	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				11	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				12	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				13	%a = load float, float addrspace(1)* %gep0
				14	%add = fadd float %a, 1.0
				15	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				16	%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				17	store float %clamp, float addrspace(1)* %out.gep
				18	ret void
				19	}
				20
				21	; GCN-LABEL: {{^}}v_clamp_multi_use_src_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	22	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	23	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				24	; GCN: v_max_f32_e64 v{{[0-9]+}}, [[ADD]], [[ADD]] clamp{{$}}
					; The unclamped %add is also written out by a volatile store, so the
					; checks expect a plain add followed by a separate v_max carrying clamp.
				25	define amdgpu_kernel void @v_clamp_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				26	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				27	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				28	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				29	%a = load float, float addrspace(1)* %gep0
				30	%add = fadd float %a, 1.0
				31	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				32	%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				33	store float %clamp, float addrspace(1)* %out.gep
				34	store volatile float %add, float addrspace(1)* undef
				35	ret void
				36	}
				37
				38	; GCN-LABEL: {{^}}v_clamp_dbg_use_src_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	39	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	40	; GCN-NOT: [[A]]
				41	; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
					; Same IR as v_clamp_add_src_f32 except that %add is also referenced by
					; llvm.dbg.value. The checks expect the same output as the baseline,
					; i.e. the debug-only use must not keep clamp off the add.
				42	define amdgpu_kernel void @v_clamp_dbg_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				43	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				44	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				45	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				46	%a = load float, float addrspace(1)* %gep0
				47	%add = fadd float %a, 1.0
				48	call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
				49	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				50	%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				51	store float %clamp, float addrspace(1)* %out.gep
				52	ret void
				53	}
				54
				55	; GCN-LABEL: {{^}}v_clamp_add_neg_src_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	56	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	57	; GCN: v_floor_f32_e32 [[FLOOR:v[0-9]+]], [[A]]
				58	; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[FLOOR]], -[[FLOOR]] clamp{{$}}
					; The clamped value is the negation (fsub -0.0, x) of a floor result.
					; The checks expect the floor to stay unclamped and the negation to be
					; applied as a source modifier on a separate v_max with clamp.
				59	define amdgpu_kernel void @v_clamp_add_neg_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				60	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				61	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				62	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				63	%a = load float, float addrspace(1)* %gep0
				64	%floor = call float @llvm.floor.f32(float %a)
				65	%neg.floor = fsub float -0.0, %floor
				66	%max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
				67	%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				68	store float %clamp, float addrspace(1)* %out.gep
				69	ret void
				70	}
				71
				72	; GCN-LABEL: {{^}}v_non_clamp_max_f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	73	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	74	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				75	; GCN: v_max_f32_e32 v{{[0-9]+}}, 0, [[ADD]]{{$}}
					; Negative test: only maxnum(x, 0.0) is present, with no minnum(x, 1.0),
					; so the checks expect a plain add and a plain max, neither with clamp.
				76	define amdgpu_kernel void @v_non_clamp_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				77	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				78	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				79	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				80	%a = load float, float addrspace(1)* %gep0
				81	%add = fadd float %a, 1.0
				82	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				83	store float %max, float addrspace(1)* %out.gep
				84	ret void
				85	}
				86
				87	; GCN-LABEL: {{^}}v_clamp_add_src_f32_denormals:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	88	; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	89	; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
					; Same IR as v_clamp_add_src_f32 but compiled with attribute group #2
					; (+fp32-denormals). The checks still expect clamp on the add.
				90	define amdgpu_kernel void @v_clamp_add_src_f32_denormals(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
				91	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				92	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				93	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				94	%a = load float, float addrspace(1)* %gep0
				95	%add = fadd float %a, 1.0
				96	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				97	%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				98	store float %clamp, float addrspace(1)* %out.gep
				99	ret void
				100	}
				101
				102	; GCN-LABEL: {{^}}v_clamp_add_src_f16_denorm:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	103	; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
				104	; GFX89: v_add_f16_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	105	
				106	; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
				107	; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
				108	; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
					; Half-precision variant using attribute group #0. The GFX89 checks
					; expect clamp on a native f16 add; the SI checks expect the add to be
					; done in f32 (convert, add with clamp, convert back).
				109	define amdgpu_kernel void @v_clamp_add_src_f16_denorm(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
				110	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				111	%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				112	%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
				113	%a = load half, half addrspace(1)* %gep0
				114	%add = fadd half %a, 1.0
				115	%max = call half @llvm.maxnum.f16(half %add, half 0.0)
				116	%clamp = call half @llvm.minnum.f16(half %max, half 1.0)
				117	store half %clamp, half addrspace(1)* %out.gep
				118	ret void
				119	}
				120
				121	; GCN-LABEL: {{^}}v_clamp_add_src_f16_no_denormals:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	122	; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
				123	; GFX89-NOT: [[A]]
				124	; GFX89: v_add_f16_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	125	
				126	; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
				127	; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
				128	; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
					; Same IR as v_clamp_add_src_f16_denorm but compiled with attribute
					; group #3 (-fp64-fp16-denormals). The GFX89 checks additionally require
					; that the loaded register has no other use before the clamped add.
				129	define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(half addrspace(1)* %out, half addrspace(1)* %aptr) #3 {
				130	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				131	%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				132	%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
				133	%a = load half, half addrspace(1)* %gep0
				134	%add = fadd half %a, 1.0
				135	%max = call half @llvm.maxnum.f16(half %add, half 0.0)
				136	%clamp = call half @llvm.minnum.f16(half %max, half 1.0)
				137	store half %clamp, half addrspace(1)* %out.gep
				138	ret void
				139	}
				140
				141	; GCN-LABEL: {{^}}v_clamp_add_src_v2f32:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	142	; GCN: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	143	; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[A]], 1.0 clamp{{$}}
				144	; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[B]], 1.0 clamp{{$}}
					; <2 x float> variant: the checks expect one clamped f32 add per element
					; of the loaded register pair, in either order.
				145	define amdgpu_kernel void @v_clamp_add_src_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %aptr) #0 {
				146	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				147	%gep0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %aptr, i32 %tid
				148	%out.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid
				149	%a = load <2 x float>, <2 x float> addrspace(1)* %gep0
				150	%add = fadd <2 x float> %a, <float 1.0, float 1.0>
				151	%max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %add, <2 x float> zeroinitializer)
				152	%clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
				153	store <2 x float> %clamp, <2 x float> addrspace(1)* %out.gep
				154	ret void
				155	}
				156
Matt Arsenault	79a45db	2017-02-22 23:53:37 +0000	[diff] [blame]	157	; GCN-LABEL: {{^}}v_clamp_add_src_f64:
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	158	; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
Matt Arsenault	79a45db	2017-02-22 23:53:37 +0000	[diff] [blame]	159	; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], 1.0 clamp{{$}}
					; Double-precision variant: the checks expect clamp on v_add_f64 itself.
				160	define amdgpu_kernel void @v_clamp_add_src_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
				161	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				162	%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
				163	%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
				164	%a = load double, double addrspace(1)* %gep0
				165	%add = fadd double %a, 1.0
				166	%max = call double @llvm.maxnum.f64(double %add, double 0.0)
				167	%clamp = call double @llvm.minnum.f64(double %max, double 1.0)
				168	store double %clamp, double addrspace(1)* %out.gep
				169	ret void
				170	}
				171
Matt Arsenault	3cb9ff8	2017-03-11 05:40:40 +0000	[diff] [blame]	172	; GCN-LABEL: {{^}}v_clamp_mac_to_mad:
				173	; GCN: v_mad_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
					; %a * %a + %b feeds the clamp, and %b is used again by the final fadd.
					; The checks expect a single v_mad_f32 carrying clamp. The addend
					; pattern accepts any VGPR number, not just single-digit ones, so the
					; check does not depend on register allocation.
				174	define amdgpu_kernel void @v_clamp_mac_to_mad(float addrspace(1)* %out, float addrspace(1)* %aptr, float %a) #0 {
				175	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				176	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				177	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				178	%b = load float, float addrspace(1)* %gep0
				179	
				180	%mul = fmul float %a, %a
				181	%add = fadd float %mul, %b
				182	%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				183	%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				184	%res = fadd float %clamp, %b
				185	store float %res, float addrspace(1)* %out.gep
				186	ret void
				187	}
				188
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	189
				190	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm:
				191	; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				192	; GFX9-DAG: s_mov_b32 [[ONE:s[0-9]+]], 0x3c003c00
				193	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], [[ONE]] clamp{{$}}
					; Packed <2 x half> variant with attribute group #0. On GFX9 the checks
					; expect the splat constant to be materialized in an SGPR and clamp to
					; be set on the packed add itself.
				194	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				195	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				196	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				197	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				198	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				199	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				200	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
				201	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				202	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				203	ret void
				204	}
				205
				206	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_no_denormals:
				207	; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				208	; GFX9-DAG: s_mov_b32 [[ONE:s[0-9]+]], 0x3c003c00
				209	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], [[ONE]] clamp{{$}}
					; Same IR as v_clamp_add_src_v2f16_denorm but compiled with attribute
					; group #3 (-fp64-fp16-denormals). The GFX9 checks are identical.
				210	define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #3 {
				211	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				212	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				213	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				214	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				215	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				216	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
				217	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				218	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				219	ret void
				220	}
				221
				222	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_neg:
				223	; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				224	; GFX9-DAG: s_mov_b32 [[ONE:s[0-9]+]], 0x3c003c00
				225	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], [[ONE]]{{$}}
				226	; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
					; Both halves of the packed add result are negated before the clamp.
					; The checks expect an unclamped packed add, then a packed max that
					; carries the negation on both halves of both operands plus clamp.
				227	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				228	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				229	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				230	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				231	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				232	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				233	%neg.add = fsub <2 x half> <half -0.0, half -0.0>, %add
				234	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.add, <2 x half> zeroinitializer)
				235	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				236	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				237	ret void
				238	}
				239
				240	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_neg_lo:
				241	; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				242	; GFX9-DAG: s_mov_b32 [[ONE:s[0-9]+]], 0x3c003c00
				243	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], [[ONE]]{{$}}
				244	; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] neg_lo:[1,1] clamp{{$}}
					; Only element 0 of the packed add result is negated (extract, fsub,
					; insert) before the clamp. The checks expect that to appear as a
					; neg_lo modifier on a separate packed max with clamp.
				245	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				246	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				247	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				248	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				249	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				250	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				251	%lo = extractelement <2 x half> %add, i32 0
				252	%neg.lo = fsub half -0.0, %lo
				253	%neg.lo.add = insertelement <2 x half> %add, half %neg.lo, i32 0
				254	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.add, <2 x half> zeroinitializer)
				255	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				256	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				257	ret void
				258	}
				259
				260	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_neg_hi:
				261	; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				262	; GFX9-DAG: s_mov_b32 [[ONE:s[0-9]+]], 0x3c003c00
				263	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], [[ONE]]{{$}}
				264	; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] neg_hi:[1,1] clamp{{$}}
					; Only element 1 of the packed add result is negated (extract, fsub,
					; insert) before the clamp. The checks expect that to appear as a
					; neg_hi modifier on a separate packed max with clamp.
				265	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				266	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				267	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				268	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				269	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				270	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				271	%hi = extractelement <2 x half> %add, i32 1
				272	%neg.hi = fsub half -0.0, %hi
				273	%neg.hi.add = insertelement <2 x half> %add, half %neg.hi, i32 1
				274	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.add, <2 x half> zeroinitializer)
				275	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				276	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				277	ret void
				278	}
				279
				280	; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_shuf:
				281	; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				282	; GFX9-DAG: s_mov_b32 [[ONE:s[0-9]+]], 0x3c003c00
				283	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], [[ONE]]{{$}}
				284	; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
					; The two halves of the packed add result are swapped by a shufflevector
					; before the clamp. The checks expect the swap to appear as op_sel /
					; op_sel_hi modifiers on a separate packed max with clamp.
				285	define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				286	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				287	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				288	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				289	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				290	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				291	%shuf = shufflevector <2 x half> %add, <2 x half> undef, <2 x i32> <i32 1, i32 0>
				292	
				293	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
				294	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				295	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				296	ret void
				297	}
				298
				299	; GCN-LABEL: {{^}}v_no_clamp_add_src_v2f16_f32_src:
				300	; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				301	; GFX9: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				302	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ADD]], [[ADD]] clamp{{$}}
					; Negative test: the add is an f32 operation on the bitcast bits, while
					; the clamp is on the <2 x half> view of its result. The checks expect
					; the f32 add to stay unclamped and a separate packed max with clamp.
				303	define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				304	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				305	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				306	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				307	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				308	%bc = bitcast <2 x half> %a to float
				309	%f32.op = fadd float %bc, 1.0
				310	%f32.op.cast = bitcast float %f32.op to <2 x half>
				311	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %f32.op.cast, <2 x half> zeroinitializer)
				312	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				313	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				314	ret void
				315	}
				316
				317	; GCN-LABEL: {{^}}v_no_clamp_add_packed_src_f32:
				318	; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
				319	; GFX9-DAG: s_mov_b32 [[ONE:s[0-9]+]], 0x3c003c00
				320	; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], [[ONE]]{{$}}
				321	; GFX9: v_max_f32_e64 [[CLAMP:v[0-9]+]], [[ADD]], [[ADD]] clamp{{$}}
					; Negative test, mirror of v_no_clamp_add_src_v2f16_f32_src: the add is
					; a packed f16 operation while the clamp is on the f32 view of its
					; result. The checks expect an unclamped packed add and a separate f32
					; max with clamp.
				322	define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
				323	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				324	%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
				325	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				326	%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
				327	%add = fadd <2 x half> %a, <half 1.0, half 1.0>
				328	%bc.add = bitcast <2 x half> %add to float
				329	%max = call float @llvm.maxnum.f32(float %bc.add, float 0.0)
				330	%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				331	store float %clamp, float addrspace(1)* %out.gep
				332	ret void
				333	}
				334
				335	; Since the high bits are zeroed, it probably would be OK in this case
				336	; to use clamp.
				337	; GCN-LABEL: {{^}}v_no_clamp_add_src_v2f16_f16_src:
				338	; GCN-DAG: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
				339	; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				340	; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ADD]], [[ADD]] clamp{{$}}
					; Negative test: a scalar half add is zero-extended to 32 bits and
					; reinterpreted as <2 x half> before the clamp. The checks expect the
					; scalar f16 add to stay unclamped and a separate packed max with clamp.
				341	define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(<2 x half> addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
				342	%tid = call i32 @llvm.amdgcn.workitem.id.x()
				343	%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				344	%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
				345	%a = load half, half addrspace(1)* %gep0
				346	%add = fadd half %a, 1.0
				347	%bc = bitcast half %add to i16
				348	%zext = zext i16 %bc to i32
				349	%v2f16 = bitcast i32 %zext to <2 x half>
				350	%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %v2f16, <2 x half> zeroinitializer)
				351	%clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
				352	store <2 x half> %clamp, <2 x half> addrspace(1)* %out.gep
				353	ret void
				354	}
				355
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	356	declare i32 @llvm.amdgcn.workitem.id.x() #1
					; Intrinsics used by the tests above, all with attribute group #1.
					; Every test calls workitem.id.x to index its input and output buffers.
					; The maxnum(x, 0.0) / minnum(x, 1.0) pairs form the clamp pattern.
					; NOTE(review): the fabs.* and amdgcn.fmed3.f32 declarations are not
					; called by any function visible in this file - confirm before removing.
				357	declare float @llvm.fabs.f32(float) #1
				358	declare float @llvm.floor.f32(float) #1
				359	declare float @llvm.minnum.f32(float, float) #1
				360	declare float @llvm.maxnum.f32(float, float) #1
				361	declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
				362	declare double @llvm.fabs.f64(double) #1
				363	declare double @llvm.minnum.f64(double, double) #1
				364	declare double @llvm.maxnum.f64(double, double) #1
				365	declare half @llvm.fabs.f16(half) #1
				366	declare half @llvm.minnum.f16(half, half) #1
				367	declare half @llvm.maxnum.f16(half, half) #1
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	368	declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
				369	declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	370	declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
				371	declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
Matt Arsenault	ab4a5cd	2017-08-31 23:53:50 +0000	[diff] [blame]	372
				373
Matt Arsenault	d5c6515	2017-02-22 23:27:53 +0000	[diff] [blame]	374	declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
					; llvm.dbg.value is called only by v_clamp_dbg_use_src_f32 in the visible source.
				375	
					; Attribute groups:
					;   #0 - used by every kernel that does not override denormal handling.
					;   #1 - used by the intrinsic declarations.
					;   #2 - enables fp32 denormals (v_clamp_add_src_f32_denormals).
					;   #3 - disables fp64/fp16 denormals (the *_no_denormals kernels).
				376	attributes #0 = { nounwind }
				377	attributes #1 = { nounwind readnone }
				378	attributes #2 = { nounwind "target-features"="+fp32-denormals" }
				379	attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" }
				380
				381	!llvm.dbg.cu = !{!0}
				382	!llvm.module.flags = !{!2, !3}
				383	
					; Debug-info metadata referenced by the llvm.dbg.value call in
					; v_clamp_dbg_use_src_f32: !4 is the variable, !9 the (empty)
					; expression and !10 the source location. !0-!3 and !5-!8 are the
					; compile unit, file, module flags, subprogram and type nodes they refer to.
				384	!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
				385	!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
				386	!2 = !{i32 2, !"Dwarf Version", i32 4}
				387	!3 = !{i32 2, !"Debug Info Version", i32 3}
				388	!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
				389	!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
				390	!6 = !DISubroutineType(types: !7)
				391	!7 = !{null, !8}
				392	!8 = !DIBasicType(name: "float", size: 32, align: 32)
				393	!9 = !DIExpression()
				394	!10 = !DILocation(line: 1, column: 42, scope: !5)