AMDGPU: Track physreg uses in SILoadStoreOptimizer
Summary:
This handles def-after-use of physregs and allows us to merge loads and
stores even across some physreg defs (typically M0 defs): a physreg def
between two merge candidates only blocks the merge if the defined
register is actually read across it.
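
A minimal standalone sketch of the idea (the Inst struct, the
canHoistSecondToFirst helper, and raw register numbers are hypothetical
illustrations, not the actual SILoadStoreOptimizer code; it assumes
merging works by hoisting the second memory op up next to the first):

  // Minimal model of the physreg dependency check. A def between the
  // pair is harmless unless the hoisted instruction reads that register,
  // and a use between the pair is harmless unless the hoisted
  // instruction defines it (the def-after-use case).
  #include <cstddef>
  #include <cstdint>
  #include <unordered_set>
  #include <vector>

  struct Inst {
    std::vector<uint32_t> Defs; // physical registers this instruction writes
    std::vector<uint32_t> Uses; // physical registers this instruction reads
  };

  static bool canHoistSecondToFirst(const std::vector<Inst> &MIs,
                                    size_t First, size_t Second) {
    std::unordered_set<uint32_t> DefsBetween, UsesBetween;
    for (size_t I = First + 1; I < Second; ++I) {
      DefsBetween.insert(MIs[I].Defs.begin(), MIs[I].Defs.end());
      UsesBetween.insert(MIs[I].Uses.begin(), MIs[I].Uses.end());
    }
    for (uint32_t Use : MIs[Second].Uses)
      if (DefsBetween.count(Use))
        return false; // Second would observe a value written after First.
    for (uint32_t Def : MIs[Second].Defs)
      if (UsesBetween.count(Def))
        return false; // Hoisting would clobber a physreg read in between.
    return true;
  }

This is also why the new test below behaves differently per target: on
GFX9, ds_read_b32 does not read M0, so the s_mov_b32 m0, s0 feeding
v_interp_mov_f32 is not a barrier and the two loads fold into
ds_read2_b32; on CI the DS instructions do read M0, so the loads stay
separate.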
Change-Id: I076484b2bda27c2cf46013c845a0380c5b89b67b
Reviewers: arsenm, mareko, rampitec
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D42647
llvm-svn: 325882
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index e72b6c1..f933dc0 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -629,6 +629,27 @@
ret void
}
+; GCN-LABEL: {{^}}ds_read_interp_read:
+; CI: s_mov_b32 m0, -1
+; CI: ds_read_b32
+; CI: s_mov_b32 m0, s0
+; CI: v_interp_mov_f32
+; CI: s_mov_b32 m0, -1
+; CI: ds_read_b32
+; GFX9: ds_read2_b32 v[0:1], v0 offset1:4
+; GFX9: s_mov_b32 m0, s0
+; GFX9: v_interp_mov_f32
+define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) {
+ %v0 = load float, float addrspace(3)* %inptr, align 4
+ %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims)
+ %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4
+ %v1 = load float, float addrspace(3)* %ptr1, align 4
+ %v1b = fadd float %v1, %intrp
+ %r0 = insertelement <2 x float> undef, float %v0, i32 0
+ %r1 = insertelement <2 x float> %r0, float %v1b, i32 1
+ ret <2 x float> %r1
+}
+
declare void @void_func_void() #3
declare i32 @llvm.amdgcn.workgroup.id.x() #1
@@ -636,6 +657,8 @@
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone
+
declare void @llvm.amdgcn.s.barrier() #2
attributes #0 = { nounwind }