; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Codegen tests for the llvm.maxnum intrinsic on half and vector-of-half
; operands. The three run lines above compile this file with llc for
; -mcpu=tahiti, -mcpu=fiji and -mcpu=gfx900 respectively, and each one enables
; its own set of FileCheck prefixes (see the -check-prefixes lists).

; Intrinsic declarations used by the tests in this file.
declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)

; Scalar case with two register operands. Both inputs are loaded with
; volatile loads, passed to llvm.maxnum.f16, and the result is stored.
; For tahiti the checks expect both halves to be converted to f32, combined
; with v_max_f32 and converted back to f16; for fiji and gfx900 they expect a
; single v_max_f16 on the loaded values.
; GCN-LABEL: {{^}}maxnum_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar case where the first operand is the constant 3.0 and the second is
; loaded from memory. The checks expect the constant to appear as a literal
; operand of the max instruction: 0x40400000 (3.0 as f32) for tahiti after the
; f16->f32 conversion, and 0x4200 (3.0 as f16) for fiji and gfx900.
; GCN-LABEL: {{^}}maxnum_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b) {
entry:
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar case where the second operand is the constant 4.0 and the first is
; loaded from memory. The checks expect 4.0 to be written directly as the
; first source operand of the max instruction on every subtarget (v_max_f32
; after conversion for tahiti, v_max_f16 for fiji and gfx900).
; GCN-LABEL: {{^}}maxnum_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; <2 x half> case with two register operands.
; Expected lowering per subtarget, as encoded by the checks below:
;  - tahiti: split each dword into two halves (shift right by 16), convert all
;    four values to f32, do two v_max_f32, convert back, then repack with a
;    shift left by 16 and an or, with no 'and' in between.
;  - fiji: one v_max_f16 for the low half and one SDWA v_max_f16 writing
;    WORD_1 for the high half, combined with an or and no 'and'.
;  - gfx900: a single v_pk_max_f16.
; The loads are matched in the order B, then A.
; GCN-LABEL: {{^}}maxnum_v2f16:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> case where the first operand is the constant vector <3.0, 4.0>.
; Expected lowering per subtarget, as encoded by the checks below:
;  - tahiti: per-element f32 max against 0x40400000 (3.0) for element 0 and
;    4.0 for element 1, then repacked with shift + or.
;  - fiji: 0x4400 (4.0 as f16) is materialized in a VGPR for the SDWA max on
;    the high half; the low half uses the literal 0x4200 (3.0 as f16).
;  - gfx900: both constants are packed into one SGPR (0x44004200) that feeds
;    a single v_pk_max_f16.
; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]

; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SIVI-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]


; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]

; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @maxnum_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> case where the second operand is the constant vector <4.0, 3.0>.
; Expected lowering per subtarget, as encoded by the checks below:
;  - tahiti: per-element f32 max against 4.0 for element 0 and 0x40400000
;    (3.0) for element 1, then repacked with shift + or.
;  - fiji: 0x4200 (3.0 as f16) is materialized in a VGPR for the SDWA max on
;    the high half; the low half uses the constant 4.0 directly.
;  - gfx900: both constants are packed into one SGPR (0x42004400) that feeds
;    a single v_pk_max_f16.
; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]

; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]


; SIVI-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]

; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @maxnum_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <3 x half> case with two register operands. Only the gfx900 run has checks
; beyond the label, and they only require that two v_pk_max_f16 instructions
; are emitted; operands, loads and the store are not matched.
; FIXME: Scalarize with undef half
; GCN-LABEL: {{^}}maxnum_v3f16:
; GFX9: v_pk_max_f16
; GFX9: v_pk_max_f16
define amdgpu_kernel void @maxnum_v3f16(
    <3 x half> addrspace(1)* %r,
    <3 x half> addrspace(1)* %a,
    <3 x half> addrspace(1)* %b) {
entry:
  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
  %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
  ret void
}

; <4 x half> case with two register operands. The fiji and gfx900 runs both
; capture the two 64-bit loads; only the gfx900 run goes on to check the
; computation, expecting one v_pk_max_f16 per 32-bit half of the vector and a
; 64-bit store of the two results. The tahiti run only checks the label.
; GCN-LABEL: {{^}}maxnum_v4f16:
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
define amdgpu_kernel void @maxnum_v4f16(
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %a,
    <4 x half> addrspace(1)* %b) {
entry:
  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

; <4 x half> case where the first operand is the constant vector
; <8.0, 2.0, 3.0, 4.0> and the second is loaded from memory.
; Expected lowering, as encoded by the checks below:
;  - gfx900: the constants are packed pairwise into two SGPRs, 0x40004800
;    (2.0 high, 8.0 low) and 0x44004200 (4.0 high, 3.0 low), each feeding one
;    v_pk_max_f16; the two results are stored as one 64-bit value.
;  - fiji: the high-half constants 0x4000 (2.0) and 0x4400 (4.0) are
;    materialized in VGPRs for SDWA maxes writing WORD_1; the low halves use
;    the literals 0x4800 (8.0) and 0x4200 (3.0); each pair is combined with an
;    or before the 64-bit store.
; The tahiti run only checks the label.
; Note that the capture names use an A_ prefix although the loaded IR value is
; %b.val.
; GCN-LABEL: {{^}}fmax_v4f16_imm_a:
; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800

; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], [[K0]]
; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], [[K1]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}

; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400

; VI-DAG: v_max_f16_sdwa v[[MAX_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 v[[MAX_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
; VI-DAG: v_max_f16_sdwa v[[MAX_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 v[[MAX_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]

; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MAX_LO_LO]], v[[MAX_LO_HI]]
; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MAX_HI_LO]], v[[MAX_HI_HI]]

; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
define amdgpu_kernel void @fmax_v4f16_imm_a(
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %b) {
entry:
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}
236}