AMDGPU: Use scalar operations for f16 (and f32) fabs/fneg patterns
Select s_and_b32/s_xor_b32/s_or_b32 with S_MOV_B32 masks instead of the
VALU forms for the f16, v2f16, and f32 fabs/fneg bit patterns. Fixes
unnecessary codegen differences between subtargets (CI vs. VI vs. GFX9)
and removes the randomly commuted operand order in the v2f16 patterns.
llvm-svn: 334184
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 67eae639..31c6061 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -908,7 +908,7 @@
def : GCNPat <
(fabs f32:$src),
- (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff)))
>;
def : GCNPat <
@@ -969,12 +969,12 @@
def : GCNPat <
(fneg f16:$src),
- (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
+ (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000)))
>;
def : GCNPat <
(fabs f16:$src),
- (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff)))
>;
def : GCNPat <
@@ -984,12 +984,12 @@
def : GCNPat <
(fneg v2f16:$src),
- (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
+ (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000)))
>;
def : GCNPat <
(fabs v2f16:$src),
- (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
+ (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff)))
>;
// This is really (fneg (fabs v2f16:$src))
@@ -998,12 +998,12 @@
// VOP3P instructions, so it is turned into the bit op.
def : GCNPat <
(fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
- (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
def : GCNPat <
(fneg (v2f16 (fabs v2f16:$src))),
- (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
/********** ================== **********/
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index e622398..e2c7b28 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -8,14 +8,9 @@
; GCN-LABEL: {{^}}s_fabs_free_f16:
; GCN: s_load_dword [[VAL:s[0-9]+]]
-
-; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
-; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
-; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
-
-; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff
-; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]]
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
+; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
+; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
%bc= bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
@@ -25,14 +20,9 @@
; GCN-LABEL: {{^}}s_fabs_f16:
; GCN: s_load_dword [[VAL:s[0-9]+]]
-
-; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
-; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
-; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
-
-; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff
-; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]]
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
+; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
+; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
%fabs = call half @llvm.fabs.f16(half %in)
store half %fabs, half addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 550ad79..3c6bdfa 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -7,37 +7,35 @@
; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
; unless isFabsFree returns true
-; FUNC-LABEL: {{^}}fabs_fn_free:
+; FUNC-LABEL: {{^}}s_fabs_fn_free:
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
-; GCN: v_and_b32
-
-define amdgpu_kernel void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
%fabs = call float @fabs(float %bc)
store float %fabs, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}fabs_free:
+; FUNC-LABEL: {{^}}s_fabs_free:
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
-; GCN: v_and_b32
-
-define amdgpu_kernel void @fabs_free(float addrspace(1)* %out, i32 %in) {
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
store float %fabs, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}fabs_f32:
+; FUNC-LABEL: {{^}}s_fabs_f32:
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; GCN: v_and_b32
-define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float %in) {
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
%fabs = call float @llvm.fabs.f32(float %in)
store float %fabs, float addrspace(1)* %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 826fdb2..d5c7c8a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -88,11 +88,7 @@
; Combine turns this into integer op when bitcast source (from load)
; GCN-LABEL: {{^}}s_fneg_fabs_v2f16_bc_src:
-
-; FIXME: Random commute
-; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
-; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
-; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %out, <2 x half> %in) {
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
%fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
@@ -101,19 +97,9 @@
}
; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-
-; FIXME: Random commute
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
-
-; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
-; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
-
-; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
-; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
-
-; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
-; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
-
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; GCN: {{flat|global}}_store_dwordx2
define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
%fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
@@ -145,8 +131,11 @@
; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_v2f16:
; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
-; GFX9: v_mov_b32_e32 [[VABS:v[0-9]+]], [[ABS]]
-; GFX9: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VABS]]
+; GFX9: v_mov_b32_e32 [[V_ABS:v[0-9]+]], [[ABS]]
+; GFX9: s_xor_b32 [[NEG:s[0-9]+]], [[ABS]], 0x80008000
+; GFX9-DAG: v_mov_b32_e32 [[V_NEG:v[0-9]+]], [[NEG]]
+; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_ABS]]
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_NEG]]
define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 0dc8544..5afcafc 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -30,14 +30,9 @@
; GCN-LABEL: {{^}}s_fneg_free_f16:
; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]],
-
-; CI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
-; CI: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]]
-; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_XOR]]
-
-; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000
-; GFX89: v_xor_b32_e32 [[XOR:v[0-9]+]], [[NEG_VALUE]], [[MASK]]
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
+; GCN: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
+; GCN: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]]
+; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_XOR]]
define amdgpu_kernel void @s_fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
%bc = bitcast i16 %in to half
%fsub = fsub half -0.0, %bc
@@ -64,20 +59,16 @@
ret void
}
-; FIXME: scalar for VI, vector for gfx9
; GCN-LABEL: {{^}}s_fneg_v2f16:
-; CIVI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
-; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+; GCN: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
store <2 x half> %fneg, <2 x half> addrspace(1)* %out
ret void
}
-; FIXME: vector on gfx9
; GCN-LABEL: {{^}}s_fneg_v2f16_nonload:
-; CIVI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
-; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+; GCN: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
define amdgpu_kernel void @s_fneg_v2f16_nonload(<2 x half> addrspace(1)* %out) #0 {
%in = call i32 asm sideeffect "; def $0", "=s"()
%in.bc = bitcast i32 %in to <2 x half>
@@ -101,10 +92,7 @@
; GCN-LABEL: {{^}}fneg_free_v2f16:
; GCN: s_load_dword [[VAL:s[0-9]+]]
-; CIVI: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000
-
-; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]]
+; GCN: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000
define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
%bc = bitcast i32 %in to <2 x half>
%fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc