AMDGPU: Invert cmp + select with constant
Canonicalize a select with a constant operand so the constant ends up on
the false side. This enables more instruction shrinking opportunities,
since an inline immediate can be used for the false side of
v_cndmask_b32_e32.

This usually seems to be better, but it causes code size regressions in
some tests.
llvm-svn: 290372
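
As a rough illustration of the effect: with the constant on the true side it
has to be materialized into a VGPR, because only the false operand (src0) of
the VOP2 form of v_cndmask_b32 accepts an inline immediate. Inverting the
compare and swapping the select operands moves the constant into src0. A
minimal sketch, assuming an f32 select of the inline-immediate value 1.0
(register numbers and the exact sequence are illustrative, not taken from the
tests below):

  ; before: constant is the true operand and must be materialized
  v_mov_b32_e32 v2, 1.0
  v_cmp_lt_f32_e32 vcc, v0, v1
  v_cndmask_b32_e32 v3, v4, v2, vcc

  ; after: compare inverted, constant used as an inline immediate in src0
  v_cmp_nlt_f32_e32 vcc, v0, v1
  v_cndmask_b32_e32 v3, 1.0, v4, vcc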
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index eadec5c..84084c9 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -102,12 +102,12 @@
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
-; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
+; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
+; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[C_F32]], v[[D_F32]], vcc
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
+; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
-; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
+; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define void @select_f16_imm_c(
@@ -262,12 +262,18 @@
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e32
-; SI: v_cmp_lt_f32_e64
-; VI: v_cmp_lt_f16_e32
-; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
+
+; SI: v_cmp_lt_f32_e32
+; SI: v_cmp_lt_f32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e64
+
+; VI: v_cmp_nlt_f16_e32
+; VI: v_cndmask_b32_e32
+
+; VI: v_cmp_nlt_f16_e32
+; VI: v_cndmask_b32_e32
+
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm