AMDGPU/SI: Support data types other than V4f32 in image intrinsics

Summary:
  Extend image intrinsics to support data types of V1F32 and V2F32.

  TODO: we should define a mapping table to change the opcode for data type of V2F32 but just one channel is active,
  even though such case should be very rare.

Reviewers:
  tstellarAMD

Differential Revision:
  http://reviews.llvm.org/D26472

llvm-svn: 286860
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
index 854793e..a65f422 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
@@ -320,6 +320,23 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}gather4_f32:
+; GCN: image_gather4 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define void @gather4_f32(float addrspace(1)* %out) {
+main_body:
+  %r = call float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}gather4_v2f32:
+; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
+define void @gather4_v2f32(<2 x float> addrspace(1)* %out) {
+main_body:
+  %r = call <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 1)
+  store <2 x float> %r, <2 x float> addrspace(1)* %out
+  ret void
+}
 
 declare <4 x float> @llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.image.gather4.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
@@ -360,5 +377,7 @@
 declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 
+declare float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
+declare <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 
 attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
index 5fe03f0..faef191 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -48,6 +48,25 @@
   ret float %elt
 }
 
+;CHECK-LABEL: {{^}}image_load_f32_v2i32:
+;CHECK: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+main_body:
+  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+  ret float %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_v2f32_v4i32:
+;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+  %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+  ret <2 x float> %tex
+}
+
+
 ;CHECK-LABEL: {{^}}image_store_v4i32:
 ;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
 define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
@@ -72,6 +91,22 @@
   ret void
 }
 
+;CHECK-LABEL: {{^}}image_store_f32_i32:
+;CHECK: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) {
+main_body:
+  call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}image_store_v2f32_v4i32:
+;CHECK: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) {
+main_body:
+  call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+  ret void
+}
+
 ;CHECK-LABEL: {{^}}image_store_mip:
 ;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
 define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
@@ -93,7 +128,6 @@
   ret void
 }
 
-
 ; Ideally, the register allocator would avoid the wait here
 ;
 ;CHECK-LABEL: {{^}}image_store_wait:
@@ -110,6 +144,13 @@
   ret void
 }
 
+
+declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare void @llvm.amdgcn.image.store.f32.i32.v8i32(float, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+
 declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
 declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
index 7a1ef0b..752ec2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
@@ -181,6 +181,23 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}sample_f32:
+; GCN: image_sample {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1
+define void @sample_f32(float addrspace(1)* %out) {
+main_body:
+  %r = call float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0)
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}sample_v2f32:
+; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3
+define void @sample_v2f32(<2 x float> addrspace(1)* %out) {
+main_body:
+  %r = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0)
+  store <2 x float> %r, <2 x float> addrspace(1)* %out
+  ret void
+}
 
 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
@@ -204,5 +221,7 @@
 declare <4 x float> @llvm.amdgcn.image.sample.c.cd.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 
+declare float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
+declare <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 
 attributes #0 = { nounwind readnone }