AMDGPU: Add a shader calling convention

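This makes it possible to distinguish between Mesa shaders
and other kernels, even in the presence of compute shaders.
Shader functions are now marked with a calling convention
(e.g. amdgpu_ps) instead of the "ShaderType" function attribute.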

Patch By: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>

Differential Revision: http://reviews.llvm.org/D18559

llvm-svn: 265589
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
index e37e291..f0d23b9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -4,7 +4,7 @@
 ;CHECK-LABEL: {{^}}image_load_v4i32:
 ;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
 ;CHECK: s_waitcnt vmcnt(0)
-define <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
+define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   ret <4 x float> %tex
@@ -13,7 +13,7 @@
 ;CHECK-LABEL: {{^}}image_load_v2i32:
 ;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
 ;CHECK: s_waitcnt vmcnt(0)
-define <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
+define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   ret <4 x float> %tex
@@ -22,7 +22,7 @@
 ;CHECK-LABEL: {{^}}image_load_i32:
 ;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
 ;CHECK: s_waitcnt vmcnt(0)
-define <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 {
+define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   ret <4 x float> %tex
@@ -31,7 +31,7 @@
 ;CHECK-LABEL: {{^}}image_load_mip:
 ;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
 ;CHECK: s_waitcnt vmcnt(0)
-define <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
+define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   ret <4 x float> %tex
@@ -40,7 +40,7 @@
 ;CHECK-LABEL: {{^}}image_load_1:
 ;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
 ;CHECK: s_waitcnt vmcnt(0)
-define float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
+define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   %elt = extractelement <4 x float> %tex, i32 0
@@ -50,7 +50,7 @@
 
 ;CHECK-LABEL: {{^}}image_store_v4i32:
 ;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
-define void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
+define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
 main_body:
   call void @llvm.amdgcn.image.store.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   ret void
@@ -58,7 +58,7 @@
 
 ;CHECK-LABEL: {{^}}image_store_v2i32:
 ;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
-define void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 {
+define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) {
 main_body:
   call void @llvm.amdgcn.image.store.v2i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   ret void
@@ -66,7 +66,7 @@
 
 ;CHECK-LABEL: {{^}}image_store_i32:
 ;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
-define void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 {
+define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) {
 main_body:
   call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   ret void
@@ -74,7 +74,7 @@
 
 ;CHECK-LABEL: {{^}}image_store_mip:
 ;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
-define void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
+define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   ret void
@@ -88,7 +88,7 @@
 ;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
-define void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) #0 {
+define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) {
 main_body:
   call void @llvm.amdgcn.image.store.i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0)
   %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0)
@@ -96,16 +96,15 @@
   ret void
 }
 
-declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #1
-declare void @llvm.amdgcn.image.store.v2i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
-declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
-declare void @llvm.amdgcn.image.store.mip.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.v2i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.mip.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
 
-declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #2
-declare <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
-declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
-declare <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind readonly }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }