AMDGPU: Fix immediate folding logic when shrinking instructions

If the literal is being folded into src0, it doesn't matter whether
src0 is currently an SGPR, because that operand is being replaced with
the literal anyway.
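To make the fold concrete, here is a minimal, self-contained C++ sketch
of the corrected check. The types and names (Operand, canFoldLiteral*)
are hypothetical simplifications for illustration, not the actual
SIShrinkInstructions code:

  #include <cassert>
  #include <cstdint>

  // Hypothetical, simplified model of a VOP source operand: either a
  // register (which may be an SGPR) or an immediate.
  struct Operand {
    bool IsImm = false;
    bool IsSGPR = false;
    uint32_t Imm = 0;
  };

  // Old check, per the description above: refused to fold a literal
  // into src0 whenever src0 held an SGPR, even though the fold
  // replaces src0 entirely.
  bool canFoldLiteralOld(const Operand &Src0) {
    return !Src0.IsSGPR; // wrong: src0's register class is irrelevant
  }

  // Fixed check: src0 is being replaced with the literal, so whether
  // it currently holds an SGPR does not matter.
  bool canFoldLiteralFixed(const Operand &) { return true; }

  int main() {
    Operand SGPRSrc0{/*IsImm=*/false, /*IsSGPR=*/true, /*Imm=*/0};
    assert(!canFoldLiteralOld(SGPRSrc0));  // old logic missed the fold
    assert(canFoldLiteralFixed(SGPRSrc0)); // fixed logic performs it
  }

The visible effect in the tests below is that literals such as
0xdf77987f are now encoded directly in v_or_b32_e32 instead of being
materialized into a register first.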

Also fixes initially selecting 32-bit versions of some instructions,
which confused commuting.
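A similarly hedged sketch of the commuting half: the 32-bit e32
encodings can carry a constant only in src0, so a literal arriving in
src1 has to be commuted into src0 for a commutable op like v_or_b32.
Again, the names below are illustrative, not LLVM's API:

  #include <cassert>
  #include <utility>

  // Hypothetical source operand, as in the sketch above.
  struct Src {
    bool IsImm = false;
  };

  // Model of a commutable VOP2 (e.g. v_or_b32): the e32 encoding can
  // encode a literal constant only in src0.
  struct VOP2 {
    Src Src0, Src1;
  };

  // Commute so that any literal ends up in src0, where e32 can encode
  // it. Returns true if src1 is literal-free afterwards. (Real
  // commuting also updates source modifiers; omitted here.)
  bool commuteLiteralIntoSrc0(VOP2 &MI) {
    if (MI.Src1.IsImm && !MI.Src0.IsImm)
      std::swap(MI.Src0, MI.Src1);
    return !MI.Src1.IsImm;
  }

  int main() {
    VOP2 MI{{/*IsImm=*/false}, {/*IsImm=*/true}}; // literal in src1
    assert(commuteLiteralIntoSrc0(MI));
    assert(MI.Src0.IsImm); // literal now sits in src0
  }

This commuting is also why several of the updated checks below flip
their expected operand order.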

llvm-svn: 281117
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 2d5e9f4..e9d26a2 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -143,7 +143,7 @@
 ; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
 ; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
 ; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
-; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
+; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
 ; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
 ; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc
 ; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 898acc3..1362fa7 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -82,8 +82,10 @@
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: -PV
 
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+; FIXME: In this case, the two uses of the constant should be folded
+; SI: s_mov_b32 [[SIGNBITK:s[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
@@ -92,10 +94,11 @@
 }
 
 ; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+; SI: s_mov_b32 [[SIGNBITK:s[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index d21d661..aa1f5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -16,7 +16,7 @@
 ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
 ; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
+; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
 ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: s_endpgm
 define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
@@ -440,7 +440,7 @@
 ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
 ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
-; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
+; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
 ; GCN-DAG: buffer_store_dword [[PACKED]]
 ; GCN: s_endpgm
 define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 4c559f4..7b1373b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -10,9 +10,7 @@
 ; VI: s_load_dword [[SRC:s[0-9]+]]
 ; VI-DAG: v_rsq_f32_e32 [[RSQ:v[0-9]+]], [[SRC]]
 ; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
-; TODO: this constant should be folded:
-; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff7fffff
-; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[K]]
+; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]]
 ; VI: buffer_store_dword [[RESULT]]
 define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 9b90ff7..56f54cf 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -113,11 +113,9 @@
 }
 
 ; FUNC-LABEL: {{^}}vector_or_i64_loadimm:
-; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
-; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
 ; SI: s_endpgm
 define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
index 47c7fbb..e422270 100644
--- a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}s_movk_i32_k0:
@@ -11,6 +11,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295032831)
   ret void
 }
 
@@ -24,6 +25,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295000063)
   ret void
 }
 
@@ -37,6 +39,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 274877939711)
   ret void
 }
 
@@ -50,6 +53,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295000064)
   ret void
 }
 
@@ -63,6 +67,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295098368)
   ret void
 }
 
@@ -77,6 +82,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 18374967954648334319)
   ret void
 }
 
@@ -90,6 +96,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 270582939713)
   ret void
 }
 
@@ -104,10 +111,10 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 70368744185856)
   ret void
 }
 
-
 ; SI-LABEL: {{^}}s_movk_i32_k8:
 ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
 ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
@@ -119,6 +126,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255906816)
   ret void
 }
 
@@ -133,6 +141,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255906817)
   ret void
 }
 
@@ -147,6 +156,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255909000)
   ret void
 }
 
@@ -161,6 +171,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255910911)
   ret void
 }
 
@@ -175,5 +186,6 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255902721)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/si-literal-folding.ll b/llvm/test/CodeGen/AMDGPU/si-literal-folding.ll
index d5030ad..b3f000c 100644
--- a/llvm/test/CodeGen/AMDGPU/si-literal-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-literal-folding.ll
@@ -1,9 +1,8 @@
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-; CHECK-LABEL: {{^}}main:
-; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0xbf4353f8
-
+; GCN-LABEL: {{^}}main:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}}
 define amdgpu_vs void @main(float) {
 main_body:
   %1 = fmul float %0, 0x3FE86A7F00000000