AMDGPU/SI: Use v_readfirstlane to legalize SMRD with VGPR base pointer
Summary:
Instead of trying to replace SMRD instructions with a VGPR base pointer
with an equivalent MUBUF instruction, we now copy the base pointer to
SGPRs using v_readfirstlane.
This is safe to do, because any load selected as an SMRD instruction
has been proven to have a uniform base pointer, so each thread in the
wave will have the same pointer value in VGPRs.
This will fix some errors on VI from trying to replace SMRD instructions
with addr64-enabled MUBUF instructions that don't exist.
Reviewers: arsenm, cfang, nhaehnle
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D17305
llvm-svn: 261385
diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll
index 4af9cdf..c919b3b 100644
--- a/llvm/test/CodeGen/AMDGPU/missing-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll
@@ -8,7 +8,9 @@
 ; FUNC-LABEL: {{^}}missing_store_reduced:
 ; SI: ds_read_b64
 ; SI: buffer_store_dword
-; SI: buffer_load_dword
+; SI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
index eebd06e..abd6b7a 100644
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -53,10 +53,14 @@
 ; Test moving an SMRD instruction to the VALU
 
 ; GCN-LABEL: {{^}}smrd_valu:
-; FIXME: We should be using flat load for HSA.
-; GCN: buffer_load_dword [[OUT:v[0-9]+]]
-; GCN-NOHSA: buffer_store_dword [[OUT]]
-; GCN-HSA: flat_store_dword {{.*}}, [[OUT]]
+; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
+; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
+; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
+; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[V_OUT]]
+; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
 define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
 entry:
   %tmp = icmp ne i32 %a, 0
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 416f396..b1277da 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -70,15 +70,14 @@
   ret void
 }
 
-; Technically we could reorder these, but just comparing the
-; instruction type of the load is insufficient.
-
-; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
-; CI: buffer_load_dword
+; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
 ; CI: buffer_store_dword
-; CI: buffer_load_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: buffer_store_dword
-define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -95,8 +94,10 @@
 }
 
 ; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
 define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {