; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s

; Code generation checks for the llvm.fmuladd intrinsic on half and <2 x half>.
; The four invocations above cover tahiti and fiji, each with the
; fp64-fp16-denormals feature disabled and enabled.

declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)

; Scalar fmuladd with all three operands loaded from global memory.
; Expected selection, per the checks below:
;   - tahiti runs: the operands are converted to f32, combined with
;     v_mac_f32, and the sum is converted back to f16 before the store.
;   - fiji with f16 denormals disabled: one v_mac_f16 accumulating into the
;     register that holds c, which is then stored.
;   - fiji with f16 denormals enabled: v_fma_f16 into a separate result
;     register, which is then stored.
; The label pattern ends in ':' so it cannot match the longer
; fmuladd_f16_imm_* symbols that share this prefix.

; GCN-LABEL: {{^}}fmuladd_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]

; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; VI-FLUSH: buffer_store_short v[[C_F16]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]

; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fmuladd.f16(half %a.val, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; fmuladd with the constant 3.0 as the first multiplicand; b and c are loaded.
; Expected selection, per the checks below:
;   - tahiti runs: 3.0 appears as the f32 literal 0x40400000 directly in
;     v_mac_f32.
;   - fiji with f16 denormals disabled: 3.0 appears as the f16 literal 0x4200
;     directly in v_mac_f16.
;   - fiji with f16 denormals enabled: 0x4200 is first moved into a VGPR and
;     that register is used as an operand of v_fma_f16.

; GCN-LABEL: {{^}}fmuladd_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]

; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]]
; VI-FLUSH: buffer_store_short v[[C_F16]]

; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]

; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; fmuladd with the constant 3.0 as the second multiplicand; a and c are loaded.
; Expected selection mirrors the imm_a case with the loaded operand being a:
;   - tahiti runs: f32 literal 0x40400000 directly in v_mac_f32.
;   - fiji with f16 denormals disabled: f16 literal 0x4200 directly in
;     v_mac_f16.
;   - fiji with f16 denormals enabled: 0x4200 is moved into a VGPR that feeds
;     v_fma_f16, and the fma result register is the one stored.
; Every directive below needs the ':' after its prefix; FileCheck silently
; ignores a line that omits it.

; GCN-LABEL: {{^}}fmuladd_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]

; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]]
; VI-FLUSH: buffer_store_short v[[C_F16]]

; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]

; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; <2 x half> fmuladd with all three vector operands loaded as single dwords.
; Expected selection, per the checks below:
;   - tahiti runs: each operand is split into its low half and its high half
;     (shift right by 16), every half is converted to f32, the two lanes are
;     combined with two v_mac_f32, converted back to f16, and repacked with a
;     shift left by 16 plus v_or_b32.
;   - fiji with f16 denormals disabled: the high lane uses v_mac_f16_sdwa
;     selecting WORD_1 of both sources, the low lane uses v_mac_f16_e32, and
;     the result is repacked with shift + or; no v_and_b32 may appear before
;     the final or.
;   - fiji with f16 denormals enabled: high halves are extracted by shifting,
;     each lane uses v_fma_f16, and the result is repacked with shift + or;
;     again no v_and_b32 may appear before the final or.
; The load of b is expected before a on fiji and after a on tahiti, hence the
; target-specific load checks around the shared ones.

; GCN-LABEL: {{^}}fmuladd_v2f16:
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]

; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]

; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]

; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
; VI-FLUSH-NOT: v_and_b32
; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]

; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]]
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
; VI-DENORM-NOT: v_and_b32
; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

define amdgpu_kernel void @fmuladd_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}