[opaque pointer type] Add textual IR support for explicit type parameter to load instruction

Essentially the same as the GEP change in r230786.
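
With this change, the result type of a load is spelled out explicitly instead
of being derived from the pointer operand, e.g.

  %val = load i32, i32* %ptr

where the old syntax was

  %val = load i32* %ptr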

A similar migration script can be used to update test cases, though a few more
test case improvements/changes were required this time around (r229269-r229278):

import sys
import re

# Matches "load [atomic ][volatile ]<ty>[ addrspace(N)]* <operand>" and
# captures the pointee type so it can be repeated as the explicit result type.
# The operand alternation also accepts FileCheck syntax ([[... and {{...), so
# CHECK lines are updated along with the IR itself.
pat = re.compile(r"((?:=|:|^)\s*load (?:atomic )?(?:volatile )?(.*?))(| addrspace\(\d+\) *)\*($| *(?:%|@|null|undef|blockaddress|getelementptr|addrspacecast|bitcast|inttoptr|\[\[[a-zA-Z]|\{\{).*$)")

# Rewrite "load <ty>* <operand>" as "load <ty>, <ty>* <operand>", preserving
# any addrspace qualifier on the pointer type.
for line in sys.stdin:
  sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line))
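
The script is a plain stdin-to-stdout filter; a typical invocation, assuming
it is saved as update_load.py (a hypothetical name), is:

  python update_load.py < test.ll > test.ll.tmp && mv test.ll.tmp test.ll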

Reviewers: rafael, dexonsmith, grosser

Differential Revision: http://reviews.llvm.org/D7649

llvm-svn: 230794
diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
index e779c9e..f3ff93f 100644
--- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
@@ -18,7 +18,7 @@
 ; IR-NOT: addrspacecast
 ; PTX-LABEL: ld_st_shared_f32(
   ; load cast
-  %1 = load float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+  %1 = load float, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
   ; store cast
   store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
@@ -29,7 +29,7 @@
 
   ; cast; load
   %2 = addrspacecast float addrspace(3)* @scalar to float*
-  %3 = load float* %2, align 4
+  %3 = load float, float* %2, align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
   ; cast; store
   store float %v, float* %2, align 4
@@ -38,7 +38,7 @@
 ; PTX: bar.sync 0;
 
   ; load gep cast
-  %4 = load float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+  %4 = load float, float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
   ; store gep cast
   store float %v, float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
@@ -48,7 +48,7 @@
 
   ; gep cast; load
   %5 = getelementptr inbounds [10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
-  %6 = load float* %5, align 4
+  %6 = load float, float* %5, align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
   ; gep cast; store
   store float %v, float* %5, align 4
@@ -59,7 +59,7 @@
   ; cast; gep; load
   %7 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]*
   %8 = getelementptr inbounds [10 x float], [10 x float]* %7, i32 0, i32 %i
-  %9 = load float* %8, align 4
+  %9 = load float, float* %8, align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
   ; cast; gep; store
   store float %v, float* %8, align 4
@@ -78,10 +78,10 @@
 ; addrspacecast with a bitcast.
 define i32 @ld_int_from_float() {
 ; IR-LABEL: @ld_int_from_float
-; IR: load i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*)
+; IR: load i32, i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*)
 ; PTX-LABEL: ld_int_from_float(
 ; PTX: ld.shared.u{{(32|64)}}
-  %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
+  %1 = load i32, i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
   ret i32 %1
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
index 03b9a98..42e67ca 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
@@ -10,7 +10,7 @@
 ; PTX64: cvta.global.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(1)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
 
@@ -22,7 +22,7 @@
 ; PTX64: cvta.shared.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(3)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
 
@@ -34,7 +34,7 @@
 ; PTX64: cvta.const.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(4)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
 
@@ -46,7 +46,7 @@
 ; PTX64: cvta.local.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(5)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
 
@@ -58,7 +58,7 @@
 ; PTX64: cvta.to.global.u64
 ; PTX64: ld.global.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(1)*
-  %val = load i32 addrspace(1)* %specptr
+  %val = load i32, i32 addrspace(1)* %specptr
   ret i32 %val
 }
 
@@ -70,7 +70,7 @@
 ; PTX64: cvta.to.shared.u64
 ; PTX64: ld.shared.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(3)*
-  %val = load i32 addrspace(3)* %specptr
+  %val = load i32, i32 addrspace(3)* %specptr
   ret i32 %val
 }
 
@@ -82,7 +82,7 @@
 ; PTX64: cvta.to.const.u64
 ; PTX64: ld.const.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(4)*
-  %val = load i32 addrspace(4)* %specptr
+  %val = load i32, i32 addrspace(4)* %specptr
   ret i32 %val
 }
 
@@ -94,6 +94,6 @@
 ; PTX64: cvta.to.local.u64
 ; PTX64: ld.local.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(5)*
-  %val = load i32 addrspace(5)* %specptr
+  %val = load i32, i32 addrspace(5)* %specptr
   ret i32 %val
 }
diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll
index 4a1588b..76af386 100644
--- a/llvm/test/CodeGen/NVPTX/bug21465.ll
+++ b/llvm/test/CodeGen/NVPTX/bug21465.ll
@@ -12,7 +12,7 @@
 ; CHECK:   bitcast %struct.S* %input to i8*
 ; CHECK:   call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8
   %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
-  %0 = load i32* %b, align 4
+  %0 = load i32, i32* %b, align 4
   store i32 %0, i32* %output, align 4
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll
index 713c34f..97863b9 100644
--- a/llvm/test/CodeGen/NVPTX/bug22322.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22322.ll
@@ -24,7 +24,7 @@
   store float %9, float* %ret_vec.sroa.8.i, align 4
 ; CHECK: setp.lt.f32     %p{{[0-9]+}}, %f{{[0-9]+}}, 0f00000000
   %10 = fcmp olt float %9, 0.000000e+00
-  %ret_vec.sroa.8.i.val = load float* %ret_vec.sroa.8.i, align 4
+  %ret_vec.sroa.8.i.val = load float, float* %ret_vec.sroa.8.i, align 4
   %11 = select i1 %10, float 0.000000e+00, float %ret_vec.sroa.8.i.val
   call void @llvm.lifetime.end(i64 4, i8* %6)
   %12 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 0
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 2fc36be..58b1911 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -27,21 +27,21 @@
 ; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]]
 ; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
 
-  %0 = load float* %a, align 4
+  %0 = load float, float* %a, align 4
   %1 = bitcast [16 x i8]* %buf to float*
   store float %0, float* %1, align 4
   %arrayidx2 = getelementptr inbounds float, float* %a, i64 1
-  %2 = load float* %arrayidx2, align 4
+  %2 = load float, float* %arrayidx2, align 4
   %arrayidx3 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 1
   %3 = bitcast i8* %arrayidx3 to float*
   store float %2, float* %3, align 4
   %arrayidx4 = getelementptr inbounds float, float* %a, i64 2
-  %4 = load float* %arrayidx4, align 4
+  %4 = load float, float* %arrayidx4, align 4
   %arrayidx5 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 2
   %5 = bitcast i8* %arrayidx5 to float*
   store float %4, float* %5, align 4
   %arrayidx6 = getelementptr inbounds float, float* %a, i64 3
-  %6 = load float* %arrayidx6, align 4
+  %6 = load float, float* %arrayidx6, align 4
   %arrayidx7 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 3
   %7 = bitcast i8* %arrayidx7 to float*
   store float %6, float* %7, align 4
diff --git a/llvm/test/CodeGen/NVPTX/fp16.ll b/llvm/test/CodeGen/NVPTX/fp16.ll
index 8770399..b85eed0 100644
--- a/llvm/test/CodeGen/NVPTX/fp16.ll
+++ b/llvm/test/CodeGen/NVPTX/fp16.ll
@@ -8,7 +8,7 @@
 ; CHECK-LABEL: @test_convert_fp16_to_fp32
 ; CHECK: cvt.f32.f16
 define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
+  %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
   ret void
@@ -18,7 +18,7 @@
 ; CHECK-LABEL: @test_convert_fp16_to_fp64
 ; CHECK: cvt.f64.f16
 define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
+  %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
   store double %cvt, double addrspace(1)* %out, align 4
   ret void
@@ -28,7 +28,7 @@
 ; CHECK-LABEL: @test_convert_fp32_to_fp16
 ; CHECK: cvt.rn.f16.f32
 define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
-  %val = load float addrspace(1)* %in, align 2
+  %val = load float, float addrspace(1)* %in, align 2
   %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
   store i16 %cvt, i16 addrspace(1)* %out, align 4
   ret void
@@ -38,7 +38,7 @@
 ; CHECK-LABEL: @test_convert_fp64_to_fp16
 ; CHECK: cvt.rn.f16.f64
 define void @test_convert_fp64_to_fp16(i16 addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
-  %val = load double addrspace(1)* %in, align 2
+  %val = load double, double addrspace(1)* %in, align 2
   %cvt = call i16 @llvm.convert.to.fp16.f64(double %val) nounwind readnone
   store i16 %cvt, i16 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
index fb63d6e..66917d5 100644
--- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -13,9 +13,9 @@
 
 define void @foo(i32* %a, i32* %b) {
 ; CHECK: cvta.global.u32
-  %ld1 = load i32* @myglobal
+  %ld1 = load i32, i32* @myglobal
 ; CHECK: cvta.global.u32
-  %ld2 = load i32* @myconst
+  %ld2 = load i32, i32* @myconst
   store i32 %ld1, i32* %a
   store i32 %ld2, i32* %b
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/half.ll b/llvm/test/CodeGen/NVPTX/half.ll
index aa08cc7..b995241 100644
--- a/llvm/test/CodeGen/NVPTX/half.ll
+++ b/llvm/test/CodeGen/NVPTX/half.ll
@@ -4,7 +4,7 @@
 ; CHECK-LABEL: @test_load_store
 ; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
-  %val = load half addrspace(1)* %in
+  %val = load half, half addrspace(1)* %in
   store half %val, half addrspace(1) * %out
   ret void
 }
@@ -13,7 +13,7 @@
 ; CHECK-LABEL: @test_bitcast_from_half
 ; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
-  %val = load half addrspace(1) * %in
+  %val = load half, half addrspace(1) * %in
   %val_int = bitcast half %val to i16
   store i16 %val_int, i16 addrspace(1)* %out
   ret void
@@ -23,7 +23,7 @@
 ; CHECK-LABEL: @test_bitcast_to_half
 ; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
-  %val = load i16 addrspace(1)* %in
+  %val = load i16, i16 addrspace(1)* %in
   %val_fp = bitcast i16 %val to half
   store half %val_fp, half addrspace(1)* %out
   ret void
@@ -33,7 +33,7 @@
 ; CHECK-LABEL: @test_extend32
 ; CHECK: cvt.f32.f16
 
-  %val16 = load half addrspace(1)* %in
+  %val16 = load half, half addrspace(1)* %in
   %val32 = fpext half %val16 to float
   store float %val32, float addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@
 ; CHECK-LABEL: @test_extend64
 ; CHECK: cvt.f64.f16
 
-  %val16 = load half addrspace(1)* %in
+  %val16 = load half, half addrspace(1)* %in
   %val64 = fpext half %val16 to double
   store double %val64, double addrspace(1)* %out
   ret void
@@ -53,7 +53,7 @@
 ; CHECK-LABEL: test_trunc32
 ; CHECK: cvt.rn.f16.f32
 
-  %val32 = load float addrspace(1)* %in
+  %val32 = load float, float addrspace(1)* %in
   %val16 = fptrunc float %val32 to half
   store half %val16, half addrspace(1)* %out
   ret void
@@ -63,7 +63,7 @@
 ; CHECK-LABEL: @test_trunc64
 ; CHECK: cvt.rn.f16.f64
 
-  %val32 = load double addrspace(1)* %in
+  %val32 = load double, double addrspace(1)* %in
   %val16 = fptrunc double %val32 to half
   store half %val16, half addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/i1-global.ll b/llvm/test/CodeGen/NVPTX/i1-global.ll
index e3fe08e..35d77b4 100644
--- a/llvm/test/CodeGen/NVPTX/i1-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-global.ll
@@ -8,7 +8,7 @@
 
 
 define void @foo(i1 %p, i32* %out) {
-  %ld = load i1 addrspace(1)* @mypred
+  %ld = load i1, i1 addrspace(1)* @mypred
   %val = zext i1 %ld to i32
   store i32 %val, i32* %out
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/i8-param.ll b/llvm/test/CodeGen/NVPTX/i8-param.ll
index 84daa9f..6a1e3a0 100644
--- a/llvm/test/CodeGen/NVPTX/i8-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i8-param.ll
@@ -13,7 +13,7 @@
 ; CHECK: .visible .func caller
 define void @caller(i8* %a) {
 ; CHECK: ld.u8
-  %val = load i8* %a
+  %val = load i8, i8* %a
   %ret = tail call i8 @callee(i8 %val)
 ; CHECK: ld.param.b32
   store i8 %ret, i8* %a
diff --git a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
index f33659c..0018e61 100644
--- a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -8,7 +8,7 @@
 ; PTX32: ret
 ; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(1)* %ptr
+  %a = load i8, i8 addrspace(1)* %ptr
   ret i8 %a
 }
 
@@ -17,7 +17,7 @@
 ; PTX32: ret
 ; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(3)* %ptr
+  %a = load i8, i8 addrspace(3)* %ptr
   ret i8 %a
 }
 
@@ -26,7 +26,7 @@
 ; PTX32: ret
 ; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(5)* %ptr
+  %a = load i8, i8 addrspace(5)* %ptr
   ret i8 %a
 }
 
@@ -36,7 +36,7 @@
 ; PTX32: ret
 ; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(1)* %ptr
+  %a = load i16, i16 addrspace(1)* %ptr
   ret i16 %a
 }
 
@@ -45,7 +45,7 @@
 ; PTX32: ret
 ; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(3)* %ptr
+  %a = load i16, i16 addrspace(3)* %ptr
   ret i16 %a
 }
 
@@ -54,7 +54,7 @@
 ; PTX32: ret
 ; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(5)* %ptr
+  %a = load i16, i16 addrspace(5)* %ptr
   ret i16 %a
 }
 
@@ -64,7 +64,7 @@
 ; PTX32: ret
 ; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(1)* %ptr
+  %a = load i32, i32 addrspace(1)* %ptr
   ret i32 %a
 }
 
@@ -73,7 +73,7 @@
 ; PTX32: ret
 ; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(3)* %ptr
+  %a = load i32, i32 addrspace(3)* %ptr
   ret i32 %a
 }
 
@@ -82,7 +82,7 @@
 ; PTX32: ret
 ; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(5)* %ptr
+  %a = load i32, i32 addrspace(5)* %ptr
   ret i32 %a
 }
 
@@ -92,7 +92,7 @@
 ; PTX32: ret
 ; PTX64: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(1)* %ptr
+  %a = load i64, i64 addrspace(1)* %ptr
   ret i64 %a
 }
 
@@ -101,7 +101,7 @@
 ; PTX32: ret
 ; PTX64: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(3)* %ptr
+  %a = load i64, i64 addrspace(3)* %ptr
   ret i64 %a
 }
 
@@ -110,7 +110,7 @@
 ; PTX32: ret
 ; PTX64: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(5)* %ptr
+  %a = load i64, i64 addrspace(5)* %ptr
   ret i64 %a
 }
 
@@ -120,7 +120,7 @@
 ; PTX32: ret
 ; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(1)* %ptr
+  %a = load float, float addrspace(1)* %ptr
   ret float %a
 }
 
@@ -129,7 +129,7 @@
 ; PTX32: ret
 ; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(3)* %ptr
+  %a = load float, float addrspace(3)* %ptr
   ret float %a
 }
 
@@ -138,7 +138,7 @@
 ; PTX32: ret
 ; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(5)* %ptr
+  %a = load float, float addrspace(5)* %ptr
   ret float %a
 }
 
@@ -148,7 +148,7 @@
 ; PTX32: ret
 ; PTX64: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(1)* %ptr
+  %a = load double, double addrspace(1)* %ptr
   ret double %a
 }
 
@@ -157,7 +157,7 @@
 ; PTX32: ret
 ; PTX64: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(3)* %ptr
+  %a = load double, double addrspace(3)* %ptr
   ret double %a
 }
 
@@ -166,6 +166,6 @@
 ; PTX32: ret
 ; PTX64: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(5)* %ptr
+  %a = load double, double addrspace(5)* %ptr
   ret double %a
 }
diff --git a/llvm/test/CodeGen/NVPTX/ld-generic.ll b/llvm/test/CodeGen/NVPTX/ld-generic.ll
index d629e0e..44cfe65 100644
--- a/llvm/test/CodeGen/NVPTX/ld-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-generic.ll
@@ -8,7 +8,7 @@
 ; PTX32: ret
 ; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(0)* %ptr
+  %a = load i8, i8 addrspace(0)* %ptr
   ret i8 %a
 }
 
@@ -18,7 +18,7 @@
 ; PTX32: ret
 ; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(0)* %ptr
+  %a = load i16, i16 addrspace(0)* %ptr
   ret i16 %a
 }
 
@@ -28,7 +28,7 @@
 ; PTX32: ret
 ; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(0)* %ptr
+  %a = load i32, i32 addrspace(0)* %ptr
   ret i32 %a
 }
 
@@ -38,7 +38,7 @@
 ; PTX32: ret
 ; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(0)* %ptr
+  %a = load i64, i64 addrspace(0)* %ptr
   ret i64 %a
 }
 
@@ -48,7 +48,7 @@
 ; PTX32: ret
 ; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(0)* %ptr
+  %a = load float, float addrspace(0)* %ptr
   ret float %a
 }
 
@@ -58,6 +58,6 @@
 ; PTX32: ret
 ; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(0)* %ptr
+  %a = load double, double addrspace(0)* %ptr
   ret double %a
 }
diff --git a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
index 6dc7bd8..9fc98a4 100644
--- a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
+++ b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
@@ -7,7 +7,7 @@
 ; CHECK: ld.u8
 ; CHECK-NOT: ld.u1
   %t1 = getelementptr i1, i1* %a1, i32 %a2
-  %t2 = load i1* %t1
+  %t2 = load i1, i1* %t1
   %t3 = sext i1 %t2 to i32
   store i32 %t3, i32* %arg3
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/machine-sink.ll b/llvm/test/CodeGen/NVPTX/machine-sink.ll
index 3614bea..65ba141 100644
--- a/llvm/test/CodeGen/NVPTX/machine-sink.ll
+++ b/llvm/test/CodeGen/NVPTX/machine-sink.ll
@@ -14,8 +14,8 @@
 define float @post_dominate(float %x, i1 %cond) {
 ; CHECK-LABEL: post_dominate(
 entry:
-  %0 = load float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
-  %1 = load float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
+  %0 = load float, float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
+  %1 = load float, float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
 ; CHECK: ld.shared.f32
 ; CHECK: ld.shared.f32
   %2 = fmul float %0, %0
diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
index 90c9c43..2ad72b0 100644
--- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
+++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -10,7 +10,7 @@
 ; CHECK-NOT: ld.f32
 ; CHECK: ld.u8
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 1
+  %r = load <4 x float>, <4 x float>* %cast, align 1
   ret <4 x float> %r
 }
 
@@ -20,7 +20,7 @@
 ; CHECK-NOT: ld.v2
 ; CHECK: ld.f32
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 4
+  %r = load <4 x float>, <4 x float>* %cast, align 4
   ret <4 x float> %r
 }
 
@@ -29,7 +29,7 @@
 ; CHECK-NOT: ld.v4
 ; CHECK: ld.v2
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 8
+  %r = load <4 x float>, <4 x float>* %cast, align 8
   ret <4 x float> %r
 }
 
@@ -37,7 +37,7 @@
 define <4 x float> @t4(i8* %p1) {
 ; CHECK: ld.v4
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 16
+  %r = load <4 x float>, <4 x float>* %cast, align 16
   ret <4 x float> %r
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index fcb88ea..2fec31b 100644
--- a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -11,16 +11,16 @@
 entry:
   %output.addr = alloca float*, align 8
   store float* %output, float** %output.addr, align 8
-  %0 = load float** %output.addr, align 8
+  %0 = load float*, float** %output.addr, align 8
   %arrayidx = getelementptr inbounds float, float* %0, i64 0
-  %1 = load float* %arrayidx, align 4
+  %1 = load float, float* %arrayidx, align 4
   %conv = fpext float %1 to double
   %cmp = fcmp olt double %conv, 1.000000e+01
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  %2 = load float** %output.addr, align 8
-  %3 = load float* %2, align 4
+  %2 = load float*, float** %output.addr, align 8
+  %3 = load float, float* %2, align 4
   %conv1 = fpext float %3 to double
   %add = fadd double %conv1, 1.000000e+00
   %conv2 = fptrunc double %add to float
@@ -28,8 +28,8 @@
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  %4 = load float** %output.addr, align 8
-  %5 = load float* %4, align 4
+  %4 = load float*, float** %output.addr, align 8
+  %5 = load float, float* %4, align 4
   %conv3 = fpext float %5 to double
   %add4 = fadd double %conv3, 2.000000e+00
   %conv5 = fptrunc double %add4 to float
@@ -38,16 +38,16 @@
 
 if.end:                                           ; preds = %if.else, %if.then
   call void @llvm.cuda.syncthreads()
-  %6 = load float** %output.addr, align 8
+  %6 = load float*, float** %output.addr, align 8
   %arrayidx6 = getelementptr inbounds float, float* %6, i64 0
-  %7 = load float* %arrayidx6, align 4
+  %7 = load float, float* %arrayidx6, align 4
   %conv7 = fpext float %7 to double
   %cmp8 = fcmp olt double %conv7, 1.000000e+01
   br i1 %cmp8, label %if.then9, label %if.else13
 
 if.then9:                                         ; preds = %if.end
-  %8 = load float** %output.addr, align 8
-  %9 = load float* %8, align 4
+  %8 = load float*, float** %output.addr, align 8
+  %9 = load float, float* %8, align 4
   %conv10 = fpext float %9 to double
   %add11 = fadd double %conv10, 3.000000e+00
   %conv12 = fptrunc double %add11 to float
@@ -55,8 +55,8 @@
   br label %if.end17
 
 if.else13:                                        ; preds = %if.end
-  %10 = load float** %output.addr, align 8
-  %11 = load float* %10, align 4
+  %10 = load float*, float** %output.addr, align 8
+  %11 = load float, float* %10, align 4
   %conv14 = fpext float %11 to double
   %add15 = fadd double %conv14, 4.000000e+00
   %conv16 = fptrunc double %add15 to float
diff --git a/llvm/test/CodeGen/NVPTX/nounroll.ll b/llvm/test/CodeGen/NVPTX/nounroll.ll
index 3e606f5..e80a4a2 100644
--- a/llvm/test/CodeGen/NVPTX/nounroll.ll
+++ b/llvm/test/CodeGen/NVPTX/nounroll.ll
@@ -18,7 +18,7 @@
   %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   %idxprom = sext i32 %i.06 to i64
   %arrayidx = getelementptr inbounds float, float* %input, i64 %idxprom
-  %0 = load float* %arrayidx, align 4
+  %0 = load float, float* %arrayidx, align 4
 ; CHECK: ld.f32
   %arrayidx2 = getelementptr inbounds float, float* %output, i64 %idxprom
   store float %0, float* %arrayidx2, align 4
diff --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
index cc67a6f..d4f7c3b 100644
--- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -19,7 +19,7 @@
 ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 
-  %t1 = load i1* %a
+  %t1 = load i1, i1* %a
   %t2 = select i1 %t1, i8 1, i8 2
   store i8 %t2, i8* %b
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/pr16278.ll b/llvm/test/CodeGen/NVPTX/pr16278.ll
index 5432a84..a836eaf 100644
--- a/llvm/test/CodeGen/NVPTX/pr16278.ll
+++ b/llvm/test/CodeGen/NVPTX/pr16278.ll
@@ -5,6 +5,6 @@
 
 define float @foo() {
 ; CHECK: ld.const.f32
-  %val = load float addrspace(4)* @one_f
+  %val = load float, float addrspace(4)* @one_f
   ret float %val
 }
diff --git a/llvm/test/CodeGen/NVPTX/refl1.ll b/llvm/test/CodeGen/NVPTX/refl1.ll
index e8782ea..0432b67 100644
--- a/llvm/test/CodeGen/NVPTX/refl1.ll
+++ b/llvm/test/CodeGen/NVPTX/refl1.ll
@@ -5,7 +5,7 @@
 ; Function Attrs: nounwind
 ; CHECK: .entry foo
 define void @foo(float* nocapture %a) #0 {
-  %val = load float* %a
+  %val = load float, float* %a
   %tan = tail call fastcc float @__nv_fast_tanf(float %val)
   store float %tan, float* %a
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/sched1.ll b/llvm/test/CodeGen/NVPTX/sched1.ll
index eb0f8ce..fb01eb2 100644
--- a/llvm/test/CodeGen/NVPTX/sched1.ll
+++ b/llvm/test/CodeGen/NVPTX/sched1.ll
@@ -12,13 +12,13 @@
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
   %ptr0 = getelementptr i32, i32* %a, i32 0
-  %val0 = load i32* %ptr0
+  %val0 = load i32, i32* %ptr0
   %ptr1 = getelementptr i32, i32* %a, i32 1
-  %val1 = load i32* %ptr1
+  %val1 = load i32, i32* %ptr1
   %ptr2 = getelementptr i32, i32* %a, i32 2
-  %val2 = load i32* %ptr2
+  %val2 = load i32, i32* %ptr2
   %ptr3 = getelementptr i32, i32* %a, i32 3
-  %val3 = load i32* %ptr3
+  %val3 = load i32, i32* %ptr3
 
   %t0 = add i32 %val0, %val1
   %t1 = add i32 %t0, %val2
diff --git a/llvm/test/CodeGen/NVPTX/sched2.ll b/llvm/test/CodeGen/NVPTX/sched2.ll
index 4d7f00e..91ed778 100644
--- a/llvm/test/CodeGen/NVPTX/sched2.ll
+++ b/llvm/test/CodeGen/NVPTX/sched2.ll
@@ -13,13 +13,13 @@
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
   %ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0
-  %val0 = load <2 x i32>* %ptr0
+  %val0 = load <2 x i32>, <2 x i32>* %ptr0
   %ptr1 = getelementptr <2 x i32>, <2 x i32>* %a, i32 1
-  %val1 = load <2 x i32>* %ptr1
+  %val1 = load <2 x i32>, <2 x i32>* %ptr1
   %ptr2 = getelementptr <2 x i32>, <2 x i32>* %a, i32 2
-  %val2 = load <2 x i32>* %ptr2
+  %val2 = load <2 x i32>, <2 x i32>* %ptr2
   %ptr3 = getelementptr <2 x i32>, <2 x i32>* %a, i32 3
-  %val3 = load <2 x i32>* %ptr3
+  %val3 = load <2 x i32>, <2 x i32>* %ptr3
 
   %t0 = add <2 x i32> %val0, %val1
   %t1 = add <2 x i32> %t0, %val2
diff --git a/llvm/test/CodeGen/NVPTX/shift-parts.ll b/llvm/test/CodeGen/NVPTX/shift-parts.ll
index 748297c..b4d408f 100644
--- a/llvm/test/CodeGen/NVPTX/shift-parts.ll
+++ b/llvm/test/CodeGen/NVPTX/shift-parts.ll
@@ -12,8 +12,8 @@
 ; CHECK: setp.gt.s32
 ; CHECK: selp.b64
 ; CHECK: shl.b64
-  %amt = load i128* %amtptr
-  %a = load i128* %val
+  %amt = load i128, i128* %amtptr
+  %a = load i128, i128* %val
   %val0 = shl i128 %a, %amt
   store i128 %val0, i128* %val
   ret void
@@ -30,8 +30,8 @@
 ; CHECK: setp.gt.s32
 ; CHECK: selp.b64
 ; CHECK: shr.s64
-  %amt = load i128* %amtptr
-  %a = load i128* %val
+  %amt = load i128, i128* %amtptr
+  %a = load i128, i128* %val
   %val0 = ashr i128 %a, %amt
   store i128 %val0, i128* %val
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/simple-call.ll b/llvm/test/CodeGen/NVPTX/simple-call.ll
index 1b41361..da65686 100644
--- a/llvm/test/CodeGen/NVPTX/simple-call.ll
+++ b/llvm/test/CodeGen/NVPTX/simple-call.ll
@@ -11,7 +11,7 @@
 
 ; CHECK: .entry kernel_func
 define void @kernel_func(float* %a) {
-  %val = load float* %a
+  %val = load float, float* %a
 ; CHECK: call.uni (retval0),
 ; CHECK: device_func,
   %mul = call float @device_func(float %val)
diff --git a/llvm/test/CodeGen/NVPTX/vector-compare.ll b/llvm/test/CodeGen/NVPTX/vector-compare.ll
index 2180499..2992b0e 100644
--- a/llvm/test/CodeGen/NVPTX/vector-compare.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-compare.ll
@@ -6,8 +6,8 @@
 ; tried to promote <2 x i1> to <2 x i8> and instruction selection failed.
 
 define void @foo(<2 x i32>* %a, <2 x i32>* %b, i32* %r1, i32* %r2) {
-  %aval = load <2 x i32>* %a
-  %bval = load <2 x i32>* %b
+  %aval = load <2 x i32>, <2 x i32>* %a
+  %bval = load <2 x i32>, <2 x i32>* %b
   %res = icmp slt <2 x i32> %aval, %bval
   %t1 = extractelement <2 x i1> %res, i32 0
   %t2 = extractelement <2 x i1> %res, i32 1
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index 58882bf..d703489 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -10,7 +10,7 @@
 define void @foo(<2 x float>* %a) {
 ; CHECK: .func foo
 ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
-  %t1 = load <2 x float>* %a
+  %t1 = load <2 x float>, <2 x float>* %a
   %t2 = fmul <2 x float> %t1, %t1
   store <2 x float> %t2, <2 x float>* %a
   ret void
@@ -19,7 +19,7 @@
 define void @foo2(<4 x float>* %a) {
 ; CHECK: .func foo2
 ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-  %t1 = load <4 x float>* %a
+  %t1 = load <4 x float>, <4 x float>* %a
   %t2 = fmul <4 x float> %t1, %t1
   store <4 x float> %t2, <4 x float>* %a
   ret void
@@ -29,7 +29,7 @@
 ; CHECK: .func foo3
 ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
 ; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-  %t1 = load <8 x float>* %a
+  %t1 = load <8 x float>, <8 x float>* %a
   %t2 = fmul <8 x float> %t1, %t1
   store <8 x float> %t2, <8 x float>* %a
   ret void
@@ -40,7 +40,7 @@
 define void @foo4(<2 x i32>* %a) {
 ; CHECK: .func foo4
 ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
-  %t1 = load <2 x i32>* %a
+  %t1 = load <2 x i32>, <2 x i32>* %a
   %t2 = mul <2 x i32> %t1, %t1
   store <2 x i32> %t2, <2 x i32>* %a
   ret void
@@ -49,7 +49,7 @@
 define void @foo5(<4 x i32>* %a) {
 ; CHECK: .func foo5
 ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
-  %t1 = load <4 x i32>* %a
+  %t1 = load <4 x i32>, <4 x i32>* %a
   %t2 = mul <4 x i32> %t1, %t1
   store <4 x i32> %t2, <4 x i32>* %a
   ret void
@@ -59,7 +59,7 @@
 ; CHECK: .func foo6
 ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
 ; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
-  %t1 = load <8 x i32>* %a
+  %t1 = load <8 x i32>, <8 x i32>* %a
   %t2 = mul <8 x i32> %t1, %t1
   store <8 x i32> %t2, <8 x i32>* %a
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/vector-select.ll b/llvm/test/CodeGen/NVPTX/vector-select.ll
index 11893df..1e81031 100644
--- a/llvm/test/CodeGen/NVPTX/vector-select.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-select.ll
@@ -6,9 +6,9 @@
 
 define void @foo(<2 x i32> addrspace(1)* %def_a, <2 x i32> addrspace(1)* %def_b, <2 x i32> addrspace(1)* %def_c) {
 entry:
-  %tmp4 = load <2 x i32> addrspace(1)* %def_a
-  %tmp6 = load <2 x i32> addrspace(1)* %def_c
-  %tmp8 = load <2 x i32> addrspace(1)* %def_b
+  %tmp4 = load <2 x i32>, <2 x i32> addrspace(1)* %def_a
+  %tmp6 = load <2 x i32>, <2 x i32> addrspace(1)* %def_c
+  %tmp8 = load <2 x i32>, <2 x i32> addrspace(1)* %def_b
   %0 = icmp sge <2 x i32> %tmp4, zeroinitializer
   %cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8
   store <2 x i32> %cond, <2 x i32> addrspace(1)* %def_c
diff --git a/llvm/test/CodeGen/NVPTX/weak-global.ll b/llvm/test/CodeGen/NVPTX/weak-global.ll
index 2bef4c5..a64f9f4 100644
--- a/llvm/test/CodeGen/NVPTX/weak-global.ll
+++ b/llvm/test/CodeGen/NVPTX/weak-global.ll
@@ -4,6 +4,6 @@
 @g = common addrspace(1) global i32 zeroinitializer
 
 define i32 @func0() {
-  %val = load i32 addrspace(1)* @g
+  %val = load i32, i32 addrspace(1)* @g
   ret i32 %val
 }