[NVPTXFavorNonGenericAddrSpaces] recursively trace into GEP and BitCast

Summary:
This patch allows NVPTXFavorNonGenericAddrSpaces to remove addrspacecast
from longer chains consisting of GEPs and BitCasts. For example, it can
now optimize

  %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
  %1 = gep [10 x float]* %0, i64 0, i64 %i
  %2 = bitcast float* %1 to i32*
  %3 = load i32* %2 ; emits ld.u32

to

  %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
  %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
  %3 = load i32 addrspace(3)* %1 ; emits ld.shared.f32

Test Plan: @ld_int_from_global_float in access-non-generic.ll

Reviewers: broune, eliben, jholewinski, meheff

Subscribers: jholewinski, llvm-commits

Differential Revision: http://reviews.llvm.org/D10074

llvm-svn: 238574
diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
index e709302..5deefe8 100644
--- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
@@ -85,6 +85,22 @@
   ret i32 %1
 }
 
+define i32 @ld_int_from_global_float(float addrspace(1)* %input, i32 %i, i32 %j) {
+; IR-LABEL: @ld_int_from_global_float(
+; PTX-LABEL: ld_int_from_global_float(
+  %1 = addrspacecast float addrspace(1)* %input to float*
+  %2 = getelementptr float, float* %1, i32 %i
+; IR-NEXT: getelementptr float, float addrspace(1)* %input, i32 %i
+  %3 = getelementptr float, float* %2, i32 %j
+; IR-NEXT: getelementptr float, float addrspace(1)* {{%[^,]+}}, i32 %j
+  %4 = bitcast float* %3 to i32*
+; IR-NEXT: bitcast float addrspace(1)* {{%[^ ]+}} to i32 addrspace(1)*
+  %5 = load i32, i32* %4
+; IR-NEXT: load i32, i32 addrspace(1)* {{%.+}}
+; PTX-LABEL: ld.global
+  ret i32 %5
+}
+
 declare void @llvm.cuda.syncthreads() #3
 
 attributes #3 = { noduplicate nounwind }