AMDGPU/SI: Support data types other than V4f32 in image intrinsics

Summary:
  Extend image intrinsics to support data types of V1F32 and V2F32.

  TODO: we should define a mapping table to change the opcode for data type of V2F32 but just one channel is active,
  even though such case should be very rare.

Reviewers:
  tstellarAMD

Differential Revision:
  http://reviews.llvm.org/D26472

llvm-svn: 286860
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
index 5fe03f0..faef191 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -48,6 +48,25 @@
   ret float %elt
 }
 
+;CHECK-LABEL: {{^}}image_load_f32_v2i32:
+;CHECK: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+main_body:
+  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+  ret float %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_v2f32_v4i32:
+;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+  %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+  ret <2 x float> %tex
+}
+
+
 ;CHECK-LABEL: {{^}}image_store_v4i32:
 ;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
 define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
@@ -72,6 +91,22 @@
   ret void
 }
 
+;CHECK-LABEL: {{^}}image_store_f32_i32:
+;CHECK: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) {
+main_body:
+  call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}image_store_v2f32_v4i32:
+;CHECK: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) {
+main_body:
+  call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+  ret void
+}
+
 ;CHECK-LABEL: {{^}}image_store_mip:
 ;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
 define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
@@ -93,7 +128,6 @@
   ret void
 }
 
-
 ; Ideally, the register allocator would avoid the wait here
 ;
 ;CHECK-LABEL: {{^}}image_store_wait:
@@ -110,6 +144,13 @@
   ret void
 }
 
+
+declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare void @llvm.amdgcn.image.store.f32.i32.v8i32(float, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+
 declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
 declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0