AMDGPU: Track physreg uses in SILoadStoreOptimizer
Summary:
This handles def-after-use of physregs, and allows us to merge loads and
stores even across some physreg defs (typically M0 defs).
Change-Id: I076484b2bda27c2cf46013c845a0380c5b89b67b
Reviewers: arsenm, mareko, rampitec
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D42647
llvm-svn: 325882
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index e72b6c1..f933dc0 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -629,6 +629,27 @@
ret void
}
+; GCN-LABEL: {{^}}ds_read_interp_read:
+; CI: s_mov_b32 m0, -1
+; CI: ds_read_b32
+; CI: s_mov_b32 m0, s0
+; CI: v_interp_mov_f32
+; CI: s_mov_b32 m0, -1
+; CI: ds_read_b32
+; GFX9: ds_read2_b32 v[0:1], v0 offset1:4
+; GFX9: s_mov_b32 m0, s0
+; GFX9: v_interp_mov_f32
+define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) {
+ %v0 = load float, float addrspace(3)* %inptr, align 4
+ %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims)
+ %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4
+ %v1 = load float, float addrspace(3)* %ptr1, align 4
+ %v1b = fadd float %v1, %intrp
+ %r0 = insertelement <2 x float> undef, float %v0, i32 0
+ %r1 = insertelement <2 x float> %r0, float %v1b, i32 1
+ ret <2 x float> %r1
+}
+
declare void @void_func_void() #3
declare i32 @llvm.amdgcn.workgroup.id.x() #1
@@ -636,6 +657,8 @@
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone
+
declare void @llvm.amdgcn.s.barrier() #2
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir
new file mode 100644
index 0000000..f7002cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir
@@ -0,0 +1,116 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+
+# Check that SILoadStoreOptimizer honors physregs defs/uses between moved
+# instructions.
+#
+# The following IR snippet would usually be optimized by the peephole optimizer.
+# However, an equivalent situation can occur with buffer instructions as well.
+
+# CHECK-LABEL: name: scc_def_and_use_no_dependency
+# CHECK: S_ADD_U32
+# CHECK: S_ADDC_U32
+# CHECK: DS_READ2_B32
+--- |
+ define amdgpu_kernel void @scc_def_and_use_no_dependency(i32 addrspace(3)* %ptr.0) nounwind {
+ %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1
+ %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16
+ ret void
+ }
+
+ define amdgpu_kernel void @scc_def_and_use_dependency(i32 addrspace(3)* %ptr.0) nounwind {
+ %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1
+ %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16
+ ret void
+ }
+...
+---
+name: scc_def_and_use_no_dependency
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: false
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$sgpr0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0
+
+ %1:vgpr_32 = COPY $vgpr0
+ %10:sgpr_32 = COPY $sgpr0
+
+ $m0 = S_MOV_B32 -1
+ %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0)
+
+ %11:sgpr_32 = S_ADD_U32 %10, 4, implicit-def $scc
+ %12:sgpr_32 = S_ADDC_U32 %10, 0, implicit-def dead $scc, implicit $scc
+
+ %3:vgpr_32 = DS_READ_B32 %1, 64, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.64)
+ S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: scc_def_and_use_dependency
+# CHECK: DS_READ2_B32
+# CHECK: S_ADD_U32
+# CHECK: S_ADDC_U32
+---
+name: scc_def_and_use_dependency
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: false
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$sgpr0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0
+
+ %1:vgpr_32 = COPY $vgpr0
+ %10:sgpr_32 = COPY $sgpr0
+
+ $m0 = S_MOV_B32 -1
+ %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0)
+ %20:sgpr_32 = V_READFIRSTLANE_B32 %2, implicit $exec
+
+ %21:sgpr_32 = S_ADD_U32 %20, 4, implicit-def $scc
+ ; The S_ADDC_U32 depends on the first DS_READ_B32 only via SCC
+ %11:sgpr_32 = S_ADDC_U32 %10, 0, implicit-def dead $scc, implicit $scc
+
+ %3:vgpr_32 = DS_READ_B32 %1, 64, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.64)
+ S_ENDPGM
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index 12190d5..8020b97 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
; SMRD load with an immediate offset.
@@ -232,15 +232,24 @@
ret void
}
-; GCN-LABEL: {{^}}smrd_imm_nomerge_m0:
+; GCN-LABEL: {{^}}smrd_imm_merge_m0:
;
-; In principle we could merge the loads here as well, but it would require
-; careful tracking of physical registers since both v_interp* and v_movrel*
-; instructions (or gpr idx mode) use M0.
+; SICIVI: s_buffer_load_dwordx2
+; SICIVI: s_mov_b32 m0
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI: s_mov_b32 m0
+; SICIVI: v_movrels_b32_e32
;
-; GCN: s_buffer_load_dword
-; GCN: s_buffer_load_dword
-define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
+; Merging is still thwarted on GFX9 due to s_set_gpr_idx
+;
+; GFX9: s_buffer_load_dword
+; GFX9: s_buffer_load_dword
+define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
main_body:
%idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
%idx1 = bitcast float %idx1.f to i32