[NVPTXFavorNonGenericAddrSpaces] recursively trace into GEP and BitCast
Summary:
This patch allows NVPTXFavorNonGenericAddrSpaces to remove addrspacecast
from longer chains consisting of GEPs and BitCasts. For example, it can
now optimize
%0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
%1 = gep [10 x float]* %0, i64 0, i64 %i
%2 = bitcast float* %1 to i32*
%3 = load i32* %2 ; emits ld.u32
to
%0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
%1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
%3 = load i32 addrspace(3)* %1 ; emits ld.shared.u32
Test Plan: @ld_int_from_global_float in access-non-generic.ll
Reviewers: broune, eliben, jholewinski, meheff
Subscribers: jholewinski, llvm-commits
Differential Revision: http://reviews.llvm.org/D10074
llvm-svn: 238574
diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
index e709302..5deefe8 100644
--- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
@@ -85,6 +85,22 @@
ret i32 %1
}
+define i32 @ld_int_from_global_float(float addrspace(1)* %input, i32 %i, i32 %j) {
+; IR-LABEL: @ld_int_from_global_float(
+; PTX-LABEL: ld_int_from_global_float(
+ %1 = addrspacecast float addrspace(1)* %input to float* ; cast out of addrspace(1); the checks below expect it to be folded away
+ %2 = getelementptr float, float* %1, i32 %i
+; IR-NEXT: getelementptr float, float addrspace(1)* %input, i32 %i
+ %3 = getelementptr float, float* %2, i32 %j
+; IR-NEXT: getelementptr float, float addrspace(1)* {{%[^,]+}}, i32 %j
+ %4 = bitcast float* %3 to i32*
+; IR-NEXT: bitcast float addrspace(1)* {{%[^ ]+}} to i32 addrspace(1)*
+ %5 = load i32, i32* %4 ; end of the GEP/GEP/bitcast chain; expected to load from addrspace(1)
+; IR-NEXT: load i32, i32 addrspace(1)* {{%.+}}
+; PTX: ld.global
+ ret i32 %5
+}
+
declare void @llvm.cuda.syncthreads() #3
attributes #3 = { noduplicate nounwind }