blob: 12573a5fee3b621b86748e2e077193b9336d6c2f [file] [log] [blame]
Matt Arsenault687ec752018-10-22 16:27:27 +00001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SIVI %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +00005
6declare half @llvm.maxnum.f16(half %a, half %b)
7declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
Matt Arsenaultf0c5c6b2018-05-22 20:42:00 +00008declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
9declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +000010
; Scalar case: loads one half from %a and one from %b (both loads are
; volatile), calls llvm.maxnum.f16 on the pair and stores the result to %r.
; Per the checks below, the tahiti run (SI prefix) converts both operands to
; f32, applies v_max_f32 and converts the result back to f16, while the fiji
; (VI prefix) and gfx900 (GFX9 prefix) runs use v_max_f16 directly.
; NOTE(review): the v_mul_f32 by 1.0 (SI) and the v_max_f16 of a register with
; itself (VI/GFX9) applied to each loaded operand look like input
; canonicalization ahead of the real max - confirm against the lowering.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +000011define amdgpu_kernel void @maxnum_f16(
Matt Arsenault687ec752018-10-22 16:27:27 +000012; SI-LABEL: maxnum_f16:
13; SI: ; %bb.0: ; %entry
14; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
15; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
16; SI-NEXT: s_mov_b32 s11, 0xf000
17; SI-NEXT: s_mov_b32 s10, -1
18; SI-NEXT: s_mov_b32 s2, s10
19; SI-NEXT: s_mov_b32 s3, s11
20; SI-NEXT: s_waitcnt lgkmcnt(0)
21; SI-NEXT: s_mov_b32 s12, s6
22; SI-NEXT: s_mov_b32 s13, s7
23; SI-NEXT: s_mov_b32 s14, s10
24; SI-NEXT: s_mov_b32 s15, s11
25; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
26; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0
27; SI-NEXT: s_mov_b32 s8, s4
28; SI-NEXT: s_mov_b32 s9, s5
29; SI-NEXT: s_waitcnt vmcnt(1)
30; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
31; SI-NEXT: s_waitcnt vmcnt(0)
32; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
33; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
34; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
35; SI-NEXT: v_max_f32_e32 v0, v0, v1
36; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
37; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
38; SI-NEXT: s_endpgm
39;
40; VI-LABEL: maxnum_f16:
41; VI: ; %bb.0: ; %entry
42; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
43; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
44; VI-NEXT: s_mov_b32 s3, 0xf000
45; VI-NEXT: s_mov_b32 s2, -1
46; VI-NEXT: s_mov_b32 s10, s2
47; VI-NEXT: s_waitcnt lgkmcnt(0)
48; VI-NEXT: s_mov_b32 s0, s4
49; VI-NEXT: s_mov_b32 s1, s5
50; VI-NEXT: s_mov_b32 s4, s6
51; VI-NEXT: s_mov_b32 s5, s7
52; VI-NEXT: s_mov_b32 s11, s3
53; VI-NEXT: s_mov_b32 s6, s2
54; VI-NEXT: s_mov_b32 s7, s3
55; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
56; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
57; VI-NEXT: s_waitcnt vmcnt(1)
58; VI-NEXT: v_max_f16_e32 v0, v0, v0
59; VI-NEXT: s_waitcnt vmcnt(0)
60; VI-NEXT: v_max_f16_e32 v1, v1, v1
61; VI-NEXT: v_max_f16_e32 v0, v0, v1
62; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
63; VI-NEXT: s_endpgm
64;
65; GFX9-LABEL: maxnum_f16:
66; GFX9: ; %bb.0: ; %entry
67; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
68; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
69; GFX9-NEXT: s_mov_b32 s3, 0xf000
70; GFX9-NEXT: s_mov_b32 s2, -1
71; GFX9-NEXT: s_mov_b32 s10, s2
72; GFX9-NEXT: s_waitcnt lgkmcnt(0)
73; GFX9-NEXT: s_mov_b32 s0, s4
74; GFX9-NEXT: s_mov_b32 s1, s5
75; GFX9-NEXT: s_mov_b32 s4, s6
76; GFX9-NEXT: s_mov_b32 s5, s7
77; GFX9-NEXT: s_mov_b32 s11, s3
78; GFX9-NEXT: s_mov_b32 s6, s2
79; GFX9-NEXT: s_mov_b32 s7, s3
80; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
81; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
82; GFX9-NEXT: s_waitcnt vmcnt(1)
83; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
84; GFX9-NEXT: s_waitcnt vmcnt(0)
85; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
86; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
87; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
88; GFX9-NEXT: s_endpgm
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +000089 half addrspace(1)* %r,
90 half addrspace(1)* %a,
91 half addrspace(1)* %b) {
92entry:
Matt Arsenault8c4a3522018-06-26 19:10:00 +000093 %a.val = load volatile half, half addrspace(1)* %a
94 %b.val = load volatile half, half addrspace(1)* %b
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +000095 %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
96 store half %r.val, half addrspace(1)* %r
97 ret void
98}
99
; Scalar case with a constant first operand: loads one half from %b, calls
; llvm.maxnum.f16(3.0, %b.val) and stores the result to %r.
; In the checks below the constant appears as the literal 0x40400000 (3.0 as
; f32) in the tahiti v_max_f32, and as 0x4200 (3.0 as f16) in the fiji and
; gfx900 v_max_f16.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000100define amdgpu_kernel void @maxnum_f16_imm_a(
Matt Arsenault687ec752018-10-22 16:27:27 +0000101; SI-LABEL: maxnum_f16_imm_a:
102; SI: ; %bb.0: ; %entry
103; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
104; SI-NEXT: s_mov_b32 s3, 0xf000
105; SI-NEXT: s_mov_b32 s2, -1
106; SI-NEXT: s_mov_b32 s10, s2
107; SI-NEXT: s_mov_b32 s11, s3
108; SI-NEXT: s_waitcnt lgkmcnt(0)
109; SI-NEXT: s_mov_b32 s8, s6
110; SI-NEXT: s_mov_b32 s9, s7
111; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
112; SI-NEXT: s_mov_b32 s0, s4
113; SI-NEXT: s_mov_b32 s1, s5
114; SI-NEXT: s_waitcnt vmcnt(0)
115; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
116; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
117; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
118; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
119; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
120; SI-NEXT: s_endpgm
121;
122; VI-LABEL: maxnum_f16_imm_a:
123; VI: ; %bb.0: ; %entry
124; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
125; VI-NEXT: s_mov_b32 s3, 0xf000
126; VI-NEXT: s_mov_b32 s2, -1
127; VI-NEXT: s_waitcnt lgkmcnt(0)
128; VI-NEXT: s_mov_b32 s0, s4
129; VI-NEXT: s_mov_b32 s1, s5
130; VI-NEXT: s_mov_b32 s4, s6
131; VI-NEXT: s_mov_b32 s5, s7
132; VI-NEXT: s_mov_b32 s6, s2
133; VI-NEXT: s_mov_b32 s7, s3
134; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
135; VI-NEXT: s_waitcnt vmcnt(0)
136; VI-NEXT: v_max_f16_e32 v0, v0, v0
137; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
138; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
139; VI-NEXT: s_endpgm
140;
141; GFX9-LABEL: maxnum_f16_imm_a:
142; GFX9: ; %bb.0: ; %entry
143; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
144; GFX9-NEXT: s_mov_b32 s3, 0xf000
145; GFX9-NEXT: s_mov_b32 s2, -1
146; GFX9-NEXT: s_waitcnt lgkmcnt(0)
147; GFX9-NEXT: s_mov_b32 s0, s4
148; GFX9-NEXT: s_mov_b32 s1, s5
149; GFX9-NEXT: s_mov_b32 s4, s6
150; GFX9-NEXT: s_mov_b32 s5, s7
151; GFX9-NEXT: s_mov_b32 s6, s2
152; GFX9-NEXT: s_mov_b32 s7, s3
153; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
154; GFX9-NEXT: s_waitcnt vmcnt(0)
155; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
156; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0
157; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
158; GFX9-NEXT: s_endpgm
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000159 half addrspace(1)* %r,
160 half addrspace(1)* %b) {
161entry:
162 %b.val = load half, half addrspace(1)* %b
163 %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
164 store half %r.val, half addrspace(1)* %r
165 ret void
166}
167
; Scalar case with a constant second operand: loads one half from %a, calls
; llvm.maxnum.f16(%a.val, 4.0) and stores the result to %r.
; In the checks below the constant is printed as a plain 4.0 operand of the
; max instruction on all three targets (no hex literal), and it is placed in
; the first source position even though it is the second intrinsic argument.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000168define amdgpu_kernel void @maxnum_f16_imm_b(
Matt Arsenault687ec752018-10-22 16:27:27 +0000169; SI-LABEL: maxnum_f16_imm_b:
170; SI: ; %bb.0: ; %entry
171; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
172; SI-NEXT: s_mov_b32 s3, 0xf000
173; SI-NEXT: s_mov_b32 s2, -1
174; SI-NEXT: s_mov_b32 s10, s2
175; SI-NEXT: s_mov_b32 s11, s3
176; SI-NEXT: s_waitcnt lgkmcnt(0)
177; SI-NEXT: s_mov_b32 s8, s6
178; SI-NEXT: s_mov_b32 s9, s7
179; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
180; SI-NEXT: s_mov_b32 s0, s4
181; SI-NEXT: s_mov_b32 s1, s5
182; SI-NEXT: s_waitcnt vmcnt(0)
183; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
184; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
185; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
186; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
187; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
188; SI-NEXT: s_endpgm
189;
190; VI-LABEL: maxnum_f16_imm_b:
191; VI: ; %bb.0: ; %entry
192; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
193; VI-NEXT: s_mov_b32 s3, 0xf000
194; VI-NEXT: s_mov_b32 s2, -1
195; VI-NEXT: s_waitcnt lgkmcnt(0)
196; VI-NEXT: s_mov_b32 s0, s4
197; VI-NEXT: s_mov_b32 s1, s5
198; VI-NEXT: s_mov_b32 s4, s6
199; VI-NEXT: s_mov_b32 s5, s7
200; VI-NEXT: s_mov_b32 s6, s2
201; VI-NEXT: s_mov_b32 s7, s3
202; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
203; VI-NEXT: s_waitcnt vmcnt(0)
204; VI-NEXT: v_max_f16_e32 v0, v0, v0
205; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
206; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
207; VI-NEXT: s_endpgm
208;
209; GFX9-LABEL: maxnum_f16_imm_b:
210; GFX9: ; %bb.0: ; %entry
211; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
212; GFX9-NEXT: s_mov_b32 s3, 0xf000
213; GFX9-NEXT: s_mov_b32 s2, -1
214; GFX9-NEXT: s_waitcnt lgkmcnt(0)
215; GFX9-NEXT: s_mov_b32 s0, s4
216; GFX9-NEXT: s_mov_b32 s1, s5
217; GFX9-NEXT: s_mov_b32 s4, s6
218; GFX9-NEXT: s_mov_b32 s5, s7
219; GFX9-NEXT: s_mov_b32 s6, s2
220; GFX9-NEXT: s_mov_b32 s7, s3
221; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
222; GFX9-NEXT: s_waitcnt vmcnt(0)
223; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
224; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0
225; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
226; GFX9-NEXT: s_endpgm
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000227 half addrspace(1)* %r,
228 half addrspace(1)* %a) {
229entry:
230 %a.val = load half, half addrspace(1)* %a
231 %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
232 store half %r.val, half addrspace(1)* %r
233 ret void
234}
235
; Two-element vector case: loads <2 x half> from %a and from %b, calls
; llvm.maxnum.v2f16 on them and stores the <2 x half> result to %r.
; Per the checks below: tahiti splits each 32-bit word into two halves
; (s_lshr_b32 by 16), works per element in f32 and repacks with a shift and an
; or; fiji uses one v_max_f16 per element, writing the high element through
; the SDWA form with dst_sel:WORD_1 and merging with v_or_b32_sdwa; gfx900
; handles both elements at once with v_pk_max_f16.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000236define amdgpu_kernel void @maxnum_v2f16(
Matt Arsenault687ec752018-10-22 16:27:27 +0000237; SI-LABEL: maxnum_v2f16:
238; SI: ; %bb.0: ; %entry
239; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
240; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
241; SI-NEXT: s_mov_b32 s3, 0xf000
242; SI-NEXT: s_mov_b32 s2, -1
243; SI-NEXT: s_waitcnt lgkmcnt(0)
244; SI-NEXT: s_load_dword s6, s[6:7], 0x0
245; SI-NEXT: s_load_dword s0, s[0:1], 0x0
246; SI-NEXT: s_waitcnt lgkmcnt(0)
247; SI-NEXT: s_lshr_b32 s1, s6, 16
248; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
249; SI-NEXT: s_lshr_b32 s0, s0, 16
250; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
251; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
252; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
253; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
254; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
255; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
256; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
257; SI-NEXT: v_max_f32_e32 v2, v3, v2
258; SI-NEXT: v_max_f32_e32 v0, v0, v1
259; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
260; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
261; SI-NEXT: s_mov_b32 s0, s4
262; SI-NEXT: s_mov_b32 s1, s5
263; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
264; SI-NEXT: v_or_b32_e32 v0, v0, v1
265; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
266; SI-NEXT: s_endpgm
267;
268; VI-LABEL: maxnum_v2f16:
269; VI: ; %bb.0: ; %entry
270; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
271; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
272; VI-NEXT: s_mov_b32 s3, 0xf000
273; VI-NEXT: s_mov_b32 s2, -1
274; VI-NEXT: s_waitcnt lgkmcnt(0)
275; VI-NEXT: s_mov_b32 s0, s4
276; VI-NEXT: s_mov_b32 s1, s5
277; VI-NEXT: s_load_dword s4, s[6:7], 0x0
278; VI-NEXT: s_load_dword s5, s[8:9], 0x0
279; VI-NEXT: s_waitcnt lgkmcnt(0)
280; VI-NEXT: v_max_f16_e64 v1, s4, s4
281; VI-NEXT: v_max_f16_e64 v0, s5, s5
282; VI-NEXT: s_lshr_b32 s4, s4, 16
283; VI-NEXT: s_lshr_b32 s5, s5, 16
284; VI-NEXT: v_max_f16_e32 v0, v1, v0
285; VI-NEXT: v_max_f16_e64 v1, s5, s5
286; VI-NEXT: v_max_f16_e64 v2, s4, s4
287; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
288; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
289; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
290; VI-NEXT: s_endpgm
291;
292; GFX9-LABEL: maxnum_v2f16:
293; GFX9: ; %bb.0: ; %entry
294; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
295; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
296; GFX9-NEXT: s_mov_b32 s3, 0xf000
297; GFX9-NEXT: s_mov_b32 s2, -1
298; GFX9-NEXT: s_waitcnt lgkmcnt(0)
299; GFX9-NEXT: s_mov_b32 s0, s4
300; GFX9-NEXT: s_mov_b32 s1, s5
301; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
302; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0
303; GFX9-NEXT: s_waitcnt lgkmcnt(0)
304; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
305; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
306; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
307; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
308; GFX9-NEXT: s_endpgm
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000309 <2 x half> addrspace(1)* %r,
310 <2 x half> addrspace(1)* %a,
311 <2 x half> addrspace(1)* %b) {
312entry:
313 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
314 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
315 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
316 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
317 ret void
318}
319
; Two-element vector case with a constant first operand: loads <2 x half>
; from %b, calls llvm.maxnum.v2f16(<3.0, 4.0>, %b.val) and stores to %r.
; In the checks below: tahiti applies 0x40400000 (3.0 as f32) to the low
; element and 4.0 to the high element; fiji uses the literal 0x4200 for the
; low element and moves 0x4400 into a VGPR for the SDWA max on the high
; element; gfx900 moves the packed constant 0x44004200 (4.0 in the high half,
; 3.0 in the low half) into an SGPR and issues a single v_pk_max_f16 with it.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000320define amdgpu_kernel void @maxnum_v2f16_imm_a(
Matt Arsenault687ec752018-10-22 16:27:27 +0000321; SI-LABEL: maxnum_v2f16_imm_a:
322; SI: ; %bb.0: ; %entry
323; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
324; SI-NEXT: s_waitcnt lgkmcnt(0)
325; SI-NEXT: s_load_dword s2, s[2:3], 0x0
326; SI-NEXT: s_mov_b32 s3, 0xf000
327; SI-NEXT: s_waitcnt lgkmcnt(0)
328; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
329; SI-NEXT: s_lshr_b32 s2, s2, 16
330; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
331; SI-NEXT: s_mov_b32 s2, -1
332; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
333; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
334; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
335; SI-NEXT: v_max_f32_e32 v1, 4.0, v1
336; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
337; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
338; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
339; SI-NEXT: v_or_b32_e32 v0, v0, v1
340; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
341; SI-NEXT: s_endpgm
342;
343; VI-LABEL: maxnum_v2f16_imm_a:
344; VI: ; %bb.0: ; %entry
345; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
346; VI-NEXT: v_mov_b32_e32 v2, 0x4400
347; VI-NEXT: s_mov_b32 s3, 0xf000
348; VI-NEXT: s_mov_b32 s2, -1
349; VI-NEXT: s_waitcnt lgkmcnt(0)
350; VI-NEXT: s_mov_b32 s0, s4
351; VI-NEXT: s_load_dword s4, s[6:7], 0x0
352; VI-NEXT: s_mov_b32 s1, s5
353; VI-NEXT: s_waitcnt lgkmcnt(0)
354; VI-NEXT: v_max_f16_e64 v0, s4, s4
355; VI-NEXT: s_lshr_b32 s4, s4, 16
356; VI-NEXT: v_max_f16_e64 v1, s4, s4
357; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
358; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
359; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
360; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
361; VI-NEXT: s_endpgm
362;
363; GFX9-LABEL: maxnum_v2f16_imm_a:
364; GFX9: ; %bb.0: ; %entry
365; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
366; GFX9-NEXT: s_mov_b32 s3, 0xf000
367; GFX9-NEXT: s_mov_b32 s2, -1
368; GFX9-NEXT: s_waitcnt lgkmcnt(0)
369; GFX9-NEXT: s_mov_b32 s0, s4
370; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
371; GFX9-NEXT: s_mov_b32 s1, s5
372; GFX9-NEXT: s_waitcnt lgkmcnt(0)
373; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
374; GFX9-NEXT: s_mov_b32 s4, 0x44004200
375; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
376; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
377; GFX9-NEXT: s_endpgm
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000378 <2 x half> addrspace(1)* %r,
379 <2 x half> addrspace(1)* %b) {
380entry:
381 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
382 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
383 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
384 ret void
385}
386
; Two-element vector case with a constant second operand: loads <2 x half>
; from %a, calls llvm.maxnum.v2f16(%a.val, <4.0, 3.0>) and stores to %r.
; The element order of the constant is swapped relative to
; maxnum_v2f16_imm_a, and the checks below reflect that: tahiti applies 4.0
; to the low element and 0x40400000 (3.0 as f32) to the high element; fiji
; uses 4.0 for the low element and moves 0x4200 into a VGPR for the SDWA max
; on the high element; gfx900 uses the packed constant 0x42004400.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000387define amdgpu_kernel void @maxnum_v2f16_imm_b(
Matt Arsenault687ec752018-10-22 16:27:27 +0000388; SI-LABEL: maxnum_v2f16_imm_b:
389; SI: ; %bb.0: ; %entry
390; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
391; SI-NEXT: s_waitcnt lgkmcnt(0)
392; SI-NEXT: s_load_dword s2, s[2:3], 0x0
393; SI-NEXT: s_mov_b32 s3, 0xf000
394; SI-NEXT: s_waitcnt lgkmcnt(0)
395; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
396; SI-NEXT: s_lshr_b32 s2, s2, 16
397; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
398; SI-NEXT: s_mov_b32 s2, -1
399; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
400; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
401; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
402; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
403; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
404; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
405; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
406; SI-NEXT: v_or_b32_e32 v0, v0, v1
407; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
408; SI-NEXT: s_endpgm
409;
410; VI-LABEL: maxnum_v2f16_imm_b:
411; VI: ; %bb.0: ; %entry
412; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
413; VI-NEXT: v_mov_b32_e32 v2, 0x4200
414; VI-NEXT: s_mov_b32 s3, 0xf000
415; VI-NEXT: s_mov_b32 s2, -1
416; VI-NEXT: s_waitcnt lgkmcnt(0)
417; VI-NEXT: s_mov_b32 s0, s4
418; VI-NEXT: s_load_dword s4, s[6:7], 0x0
419; VI-NEXT: s_mov_b32 s1, s5
420; VI-NEXT: s_waitcnt lgkmcnt(0)
421; VI-NEXT: v_max_f16_e64 v0, s4, s4
422; VI-NEXT: s_lshr_b32 s4, s4, 16
423; VI-NEXT: v_max_f16_e64 v1, s4, s4
424; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
425; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
426; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
427; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
428; VI-NEXT: s_endpgm
429;
430; GFX9-LABEL: maxnum_v2f16_imm_b:
431; GFX9: ; %bb.0: ; %entry
432; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
433; GFX9-NEXT: s_mov_b32 s3, 0xf000
434; GFX9-NEXT: s_mov_b32 s2, -1
435; GFX9-NEXT: s_waitcnt lgkmcnt(0)
436; GFX9-NEXT: s_mov_b32 s0, s4
437; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
438; GFX9-NEXT: s_mov_b32 s1, s5
439; GFX9-NEXT: s_waitcnt lgkmcnt(0)
440; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
441; GFX9-NEXT: s_mov_b32 s4, 0x42004400
442; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
443; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
444; GFX9-NEXT: s_endpgm
Konstantin Zhuravlyovf86e4b72016-11-13 07:01:11 +0000445 <2 x half> addrspace(1)* %r,
446 <2 x half> addrspace(1)* %a) {
447entry:
448 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
449 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
450 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
451 ret void
452}
Matt Arsenaultf0c5c6b2018-05-22 20:42:00 +0000453
454; FIXME: Scalarize with undef half
; Three-element vector case: loads <3 x half> from %a and from %b, calls
; llvm.maxnum.v3f16 on them and stores the <3 x half> result to %r.
; Per the checks below, each input is fetched with a 64-bit scalar load and
; the result is written as two stores: a dword holding the first two elements
; and a short at offset 4 holding the third. Tahiti computes three f32 maxes,
; fiji three f16 maxes (the second element through SDWA), and gfx900 two
; v_pk_max_f16 operations, the second of which feeds the short store.
Matt Arsenaultf0c5c6b2018-05-22 20:42:00 +0000455define amdgpu_kernel void @maxnum_v3f16(
Matt Arsenault687ec752018-10-22 16:27:27 +0000456; SI-LABEL: maxnum_v3f16:
457; SI: ; %bb.0: ; %entry
458; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
459; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
460; SI-NEXT: s_mov_b32 s3, 0xf000
461; SI-NEXT: s_mov_b32 s2, -1
462; SI-NEXT: s_waitcnt lgkmcnt(0)
463; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
464; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
465; SI-NEXT: s_mov_b32 s0, s4
466; SI-NEXT: s_waitcnt lgkmcnt(0)
467; SI-NEXT: s_lshr_b32 s1, s6, 16
468; SI-NEXT: s_lshr_b32 s4, s8, 16
469; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
470; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
471; SI-NEXT: v_cvt_f32_f16_e32 v1, s6
472; SI-NEXT: v_cvt_f32_f16_e32 v5, s8
473; SI-NEXT: v_cvt_f32_f16_e32 v0, s7
474; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
475; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
476; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
477; SI-NEXT: v_max_f32_e32 v2, v3, v2
478; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
479; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
480; SI-NEXT: v_max_f32_e32 v1, v1, v3
481; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
482; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
483; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
484; SI-NEXT: v_max_f32_e32 v0, v0, v3
485; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
486; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
487; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
488; SI-NEXT: s_mov_b32 s1, s5
489; SI-NEXT: v_or_b32_e32 v1, v1, v2
490; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
491; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
492; SI-NEXT: s_endpgm
493;
494; VI-LABEL: maxnum_v3f16:
495; VI: ; %bb.0: ; %entry
496; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
497; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
498; VI-NEXT: s_mov_b32 s3, 0xf000
499; VI-NEXT: s_mov_b32 s2, -1
500; VI-NEXT: s_waitcnt lgkmcnt(0)
501; VI-NEXT: s_mov_b32 s0, s4
502; VI-NEXT: s_mov_b32 s1, s5
503; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
504; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
505; VI-NEXT: s_waitcnt lgkmcnt(0)
506; VI-NEXT: v_max_f16_e64 v1, s4, s4
507; VI-NEXT: v_max_f16_e64 v0, s6, s6
508; VI-NEXT: s_lshr_b32 s4, s4, 16
509; VI-NEXT: s_lshr_b32 s6, s6, 16
510; VI-NEXT: v_max_f16_e32 v0, v1, v0
511; VI-NEXT: v_max_f16_e64 v1, s6, s6
512; VI-NEXT: v_max_f16_e64 v2, s4, s4
513; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
514; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
515; VI-NEXT: v_max_f16_e64 v1, s7, s7
516; VI-NEXT: v_max_f16_e64 v2, s5, s5
517; VI-NEXT: v_max_f16_e32 v1, v2, v1
518; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
519; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
520; VI-NEXT: s_endpgm
521;
522; GFX9-LABEL: maxnum_v3f16:
523; GFX9: ; %bb.0: ; %entry
524; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
525; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
526; GFX9-NEXT: s_mov_b32 s3, 0xf000
527; GFX9-NEXT: s_mov_b32 s2, -1
528; GFX9-NEXT: s_waitcnt lgkmcnt(0)
529; GFX9-NEXT: s_mov_b32 s0, s4
530; GFX9-NEXT: s_mov_b32 s1, s5
531; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
532; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
533; GFX9-NEXT: s_waitcnt lgkmcnt(0)
534; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
535; GFX9-NEXT: v_pk_max_f16 v0, s6, s6
536; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
537; GFX9-NEXT: v_pk_max_f16 v2, s7, s7
538; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
539; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
540; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
541; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
542; GFX9-NEXT: s_endpgm
Matt Arsenaultf0c5c6b2018-05-22 20:42:00 +0000543 <3 x half> addrspace(1)* %r,
544 <3 x half> addrspace(1)* %a,
545 <3 x half> addrspace(1)* %b) {
546entry:
547 %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
548 %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
549 %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
550 store <3 x half> %r.val, <3 x half> addrspace(1)* %r
551 ret void
552}
553
; Four-element vector case: loads <4 x half> from %a and from %b, calls
; llvm.maxnum.v4f16 on them and stores the <4 x half> result to %r.
; Per the checks below, each input is fetched with a 64-bit scalar load and
; the result is written with a single buffer_store_dwordx2. Tahiti computes
; four f32 maxes and repacks pairs with shift/or; fiji computes four f16
; maxes, using the SDWA form for the high element of each 32-bit word; gfx900
; uses one v_pk_max_f16 per 32-bit word.
Matt Arsenaultf0c5c6b2018-05-22 20:42:00 +0000554define amdgpu_kernel void @maxnum_v4f16(
Matt Arsenault687ec752018-10-22 16:27:27 +0000555; SI-LABEL: maxnum_v4f16:
556; SI: ; %bb.0: ; %entry
557; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
558; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
559; SI-NEXT: s_mov_b32 s3, 0xf000
560; SI-NEXT: s_mov_b32 s2, -1
561; SI-NEXT: s_waitcnt lgkmcnt(0)
562; SI-NEXT: s_mov_b32 s0, s4
563; SI-NEXT: s_mov_b32 s1, s5
564; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
565; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
566; SI-NEXT: s_waitcnt lgkmcnt(0)
567; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
568; SI-NEXT: s_lshr_b32 s4, s4, 16
569; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
570; SI-NEXT: s_lshr_b32 s4, s5, 16
571; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
572; SI-NEXT: s_lshr_b32 s4, s7, 16
573; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
574; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
575; SI-NEXT: s_lshr_b32 s4, s6, 16
576; SI-NEXT: v_cvt_f32_f16_e32 v7, s7
577; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
578; SI-NEXT: v_cvt_f32_f16_e32 v4, s6
579; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
580; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
581; SI-NEXT: v_max_f32_e32 v3, v3, v5
582; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
583; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
584; SI-NEXT: v_max_f32_e32 v1, v1, v5
585; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
586; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
587; SI-NEXT: v_max_f32_e32 v2, v2, v5
588; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
589; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
590; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
591; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
592; SI-NEXT: v_max_f32_e32 v0, v0, v4
593; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
594; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
595; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
596; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
597; SI-NEXT: v_or_b32_e32 v1, v1, v3
598; SI-NEXT: v_or_b32_e32 v0, v0, v2
599; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
600; SI-NEXT: s_endpgm
601;
602; VI-LABEL: maxnum_v4f16:
603; VI: ; %bb.0: ; %entry
604; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
605; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
606; VI-NEXT: s_mov_b32 s3, 0xf000
607; VI-NEXT: s_mov_b32 s2, -1
608; VI-NEXT: s_waitcnt lgkmcnt(0)
609; VI-NEXT: s_mov_b32 s0, s4
610; VI-NEXT: s_mov_b32 s1, s5
611; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
612; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
613; VI-NEXT: s_waitcnt lgkmcnt(0)
614; VI-NEXT: v_max_f16_e64 v1, s5, s5
615; VI-NEXT: v_max_f16_e64 v0, s7, s7
616; VI-NEXT: s_lshr_b32 s5, s5, 16
617; VI-NEXT: s_lshr_b32 s7, s7, 16
618; VI-NEXT: v_max_f16_e32 v0, v1, v0
619; VI-NEXT: v_max_f16_e64 v2, s5, s5
620; VI-NEXT: v_max_f16_e64 v1, s7, s7
621; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
622; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
623; VI-NEXT: v_max_f16_e64 v2, s4, s4
624; VI-NEXT: v_max_f16_e64 v0, s6, s6
625; VI-NEXT: s_lshr_b32 s4, s4, 16
626; VI-NEXT: s_lshr_b32 s5, s6, 16
627; VI-NEXT: v_max_f16_e32 v0, v2, v0
628; VI-NEXT: v_max_f16_e64 v2, s5, s5
629; VI-NEXT: v_max_f16_e64 v3, s4, s4
630; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
631; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
632; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
633; VI-NEXT: s_endpgm
634;
635; GFX9-LABEL: maxnum_v4f16:
636; GFX9: ; %bb.0: ; %entry
637; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
638; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
639; GFX9-NEXT: s_mov_b32 s3, 0xf000
640; GFX9-NEXT: s_mov_b32 s2, -1
641; GFX9-NEXT: s_waitcnt lgkmcnt(0)
642; GFX9-NEXT: s_mov_b32 s0, s4
643; GFX9-NEXT: s_mov_b32 s1, s5
644; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
645; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
646; GFX9-NEXT: s_waitcnt lgkmcnt(0)
647; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
648; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
649; GFX9-NEXT: v_pk_max_f16 v1, v1, v0
650; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
651; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
652; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
653; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
654; GFX9-NEXT: s_endpgm
Matt Arsenaultf0c5c6b2018-05-22 20:42:00 +0000655 <4 x half> addrspace(1)* %r,
656 <4 x half> addrspace(1)* %a,
657 <4 x half> addrspace(1)* %b) {
658entry:
659 %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
660 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
661 %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
662 store <4 x half> %r.val, <4 x half> addrspace(1)* %r
663 ret void
664}
665
; Four-element vector case with a constant first operand: loads <4 x half>
; from %b, calls llvm.maxnum.v4f16(<8.0, 2.0, 3.0, 4.0>, %b.val) and stores
; the result to %r.
; In the checks below: tahiti uses 0x41000000 (8.0 as f32), 2.0, 0x40400000
; (3.0 as f32) and 4.0 for elements 0..3; fiji uses the f16 literals 0x4800
; and 0x4200 for elements 0 and 2 and moves 0x4000 / 0x4400 into VGPRs for
; the SDWA maxes on elements 1 and 3; gfx900 materializes the packed
; constants 0x40004800 (elements 1:0) and 0x44004200 (elements 3:2) in SGPRs.
; NOTE(review): this test is named with an fmax_ prefix while every other
; test in this file uses maxnum_; renaming would also require regenerating
; the label checks below - confirm whether the difference is intentional.
Matt Arsenaultf0c5c6b2018-05-22 20:42:00 +0000666define amdgpu_kernel void @fmax_v4f16_imm_a(
Matt Arsenault687ec752018-10-22 16:27:27 +0000667; SI-LABEL: fmax_v4f16_imm_a:
668; SI: ; %bb.0: ; %entry
669; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
670; SI-NEXT: s_mov_b32 s3, 0xf000
671; SI-NEXT: s_mov_b32 s2, -1
672; SI-NEXT: s_waitcnt lgkmcnt(0)
673; SI-NEXT: s_mov_b32 s0, s4
674; SI-NEXT: s_mov_b32 s1, s5
675; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
676; SI-NEXT: s_waitcnt lgkmcnt(0)
677; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
678; SI-NEXT: s_lshr_b32 s5, s5, 16
679; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
680; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
681; SI-NEXT: s_lshr_b32 s4, s4, 16
682; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
683; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
684; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
685; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
686; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
687; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
688; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
689; SI-NEXT: v_max_f32_e32 v3, 2.0, v3
690; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
691; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
692; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
693; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
694; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
695; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
696; SI-NEXT: v_or_b32_e32 v1, v1, v2
697; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
698; SI-NEXT: v_or_b32_e32 v0, v0, v2
699; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
700; SI-NEXT: s_endpgm
701;
702; VI-LABEL: fmax_v4f16_imm_a:
703; VI: ; %bb.0: ; %entry
704; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
705; VI-NEXT: v_mov_b32_e32 v0, 0x4400
706; VI-NEXT: s_mov_b32 s3, 0xf000
707; VI-NEXT: s_mov_b32 s2, -1
708; VI-NEXT: s_waitcnt lgkmcnt(0)
709; VI-NEXT: s_mov_b32 s0, s4
710; VI-NEXT: s_mov_b32 s1, s5
711; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
712; VI-NEXT: s_waitcnt lgkmcnt(0)
713; VI-NEXT: v_max_f16_e64 v1, s5, s5
714; VI-NEXT: s_lshr_b32 s5, s5, 16
715; VI-NEXT: v_max_f16_e64 v3, s5, s5
716; VI-NEXT: v_max_f16_e64 v2, s4, s4
717; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
718; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1
719; VI-NEXT: s_lshr_b32 s4, s4, 16
720; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
721; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2
722; VI-NEXT: v_max_f16_e64 v2, s4, s4
723; VI-NEXT: v_mov_b32_e32 v3, 0x4000
724; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
725; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
726; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
727; VI-NEXT: s_endpgm
728;
729; GFX9-LABEL: fmax_v4f16_imm_a:
730; GFX9: ; %bb.0: ; %entry
731; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
732; GFX9-NEXT: s_mov_b32 s8, 0x44004200
733; GFX9-NEXT: s_mov_b32 s9, 0x40004800
734; GFX9-NEXT: s_mov_b32 s3, 0xf000
735; GFX9-NEXT: s_mov_b32 s2, -1
736; GFX9-NEXT: s_waitcnt lgkmcnt(0)
737; GFX9-NEXT: s_mov_b32 s0, s4
738; GFX9-NEXT: s_mov_b32 s1, s5
739; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
740; GFX9-NEXT: s_waitcnt lgkmcnt(0)
741; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
742; GFX9-NEXT: v_pk_max_f16 v2, s4, s4
743; GFX9-NEXT: v_pk_max_f16 v1, v0, s8
744; GFX9-NEXT: v_pk_max_f16 v0, v2, s9
745; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
746; GFX9-NEXT: s_endpgm
Matt Arsenaultf0c5c6b2018-05-22 20:42:00 +0000747 <4 x half> addrspace(1)* %r,
748 <4 x half> addrspace(1)* %b) {
749entry:
750 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
751 %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
752 store <4 x half> %r.val, <4 x half> addrspace(1)* %r
753 ret void
754}