[AMDGPU] add support for hostcall buffer pointer as hidden kernel argument

Hostcall is a service that allows a kernel to submit requests to the
host using shared buffers, and block until a response is
received. This will eventually replace the shared buffer currently
used for printf, and repurposes the same hidden kernel argument. This
change introduces a new ValueKind in the HSA metadata to represent the
hostcall buffer.

Differential Revision: https://reviews.llvm.org/D70038
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index fc624a7..758e65c 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1382,6 +1382,11 @@
                                                   to the runtime printf buffer
                                                   is passed in kernarg.
 
+                                                "HiddenHostcallBuffer"
+                                                  A global address space pointer
+                                                  to the runtime hostcall buffer
+                                                  is passed in kernarg.
+
                                                 "HiddenDefaultQueue"
                                                   A global address space pointer
                                                   to the OpenCL device enqueue
@@ -1876,6 +1881,11 @@
                                                        to the runtime printf buffer
                                                        is passed in kernarg.
 
+                                                     "hidden_hostcall_buffer"
+                                                       A global address space pointer
+                                                       to the runtime hostcall buffer
+                                                       is passed in kernarg.
+
                                                      "hidden_default_queue"
                                                        A global address space pointer
                                                        to the OpenCL device enqueue
diff --git a/llvm/include/llvm/Support/AMDGPUMetadata.h b/llvm/include/llvm/Support/AMDGPUMetadata.h
index f7f1ec4..eeef4e6 100644
--- a/llvm/include/llvm/Support/AMDGPUMetadata.h
+++ b/llvm/include/llvm/Support/AMDGPUMetadata.h
@@ -75,6 +75,7 @@
   HiddenDefaultQueue     = 12,
   HiddenCompletionAction = 13,
   HiddenMultiGridSyncArg = 14,
+  HiddenHostcallBuffer   = 15,
   Unknown                = 0xff
 };
 
diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
index 3f36dff..d927171 100644
--- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
+++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
@@ -119,6 +119,7 @@
                                .Case("hidden_global_offset_z", true)
                                .Case("hidden_none", true)
                                .Case("hidden_printf_buffer", true)
+                               .Case("hidden_hostcall_buffer", true)
                                .Case("hidden_default_queue", true)
                                .Case("hidden_completion_action", true)
                                .Case("hidden_multigrid_sync_arg", true)
diff --git a/llvm/lib/Support/AMDGPUMetadata.cpp b/llvm/lib/Support/AMDGPUMetadata.cpp
index 5f81022..4ea197a 100644
--- a/llvm/lib/Support/AMDGPUMetadata.cpp
+++ b/llvm/lib/Support/AMDGPUMetadata.cpp
@@ -62,6 +62,7 @@
     YIO.enumCase(EN, "HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ);
     YIO.enumCase(EN, "HiddenNone", ValueKind::HiddenNone);
     YIO.enumCase(EN, "HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer);
+    YIO.enumCase(EN, "HiddenHostcallBuffer", ValueKind::HiddenHostcallBuffer);
     YIO.enumCase(EN, "HiddenDefaultQueue", ValueKind::HiddenDefaultQueue);
     YIO.enumCase(EN, "HiddenCompletionAction",
                  ValueKind::HiddenCompletionAction);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 9f5bcd8..511d629 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -421,7 +421,12 @@
   if (HiddenArgNumBytes >= 32) {
     if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
       emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
-    else
+    else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) {
+      // The printf runtime binding pass should have ensured that hostcall and
+      // printf are not used in the same module.
+      assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts"));
+      emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenHostcallBuffer);
+    } else
       emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
   }
 
@@ -854,7 +859,12 @@
   if (HiddenArgNumBytes >= 32) {
     if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
       emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args);
-    else
+    else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) {
+      // The printf runtime binding pass should have ensured that hostcall and
+      // printf are not used in the same module.
+      assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts"));
+      emitKernelArg(DL, Int8PtrTy, "hidden_hostcall_buffer", Offset, Args);
+    } else
       emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 1f44012..511de96 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -582,6 +582,15 @@
   if (Printfs.empty())
     return false;
 
+  if (auto HostcallFunction = M.getFunction("__ockl_hostcall_internal")) {
+    for (auto &U : HostcallFunction->uses()) {
+      if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
+        M.getContext().emitError(
+            CI, "Cannot use both printf and hostcall in the same module");
+      }
+    }
+  }
+
   TD = &M.getDataLayout();
   auto DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
index 72e96a1..9d34d41 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
@@ -42,6 +42,7 @@
 ; CHECK-NEXT:         .size:           8
 ; CHECK-NOT:          .value_kind:     hidden_default_queue
 ; CHECK-NOT:          .value_kind:     hidden_completion_action
+; CHECK-NOT:          .value_kind:     hidden_hostcall_buffer
 ; CHECK-NEXT:         .value_kind:     hidden_printf_buffer
 ; CHECK-NEXT:         .value_type:     i8
 ; CHECK:              .value_kind:     hidden_multigrid_sync_arg
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
index db1d5c5..f5a47a8 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
@@ -49,6 +49,7 @@
 ; CHECK-NEXT:       ValueType:     I64
 ; CHECK-NEXT:     - Size:          8
 ; CHECK-NEXT:       Align:         8
+; CHECK-NOT:        ValueKind:     HiddenHostcallBuffer
 ; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
 ; CHECK-NEXT:       ValueType:     I8
 ; CHECK-NEXT:       AddrSpaceQual: Global
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent-v3.ll
new file mode 100644
index 0000000..8741bfb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent-v3.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK:              ---
+; CHECK:      amdhsa.kernels:
+; CHECK:        - .args:
+; CHECK-NEXT:       - .name:           a
+; CHECK-NEXT:         .offset:         0
+; CHECK-NEXT:         .size:           1
+; CHECK-NEXT:         .type_name:      char
+; CHECK-NEXT:         .value_kind:     by_value
+; CHECK-NEXT:         .value_type:     i8
+; CHECK-NEXT:       - .offset:         8
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:         .value_type:     i64
+; CHECK-NEXT:       - .offset:         16
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:         .value_type:     i64
+; CHECK-NEXT:       - .offset:         24
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:         .value_type:     i64
+
+; CHECK-NOT:          .value_kind:     hidden_hostcall_buffer
+
+; CHECK:          .language:       OpenCL C
+; CHECK-NEXT:     .language_version:
+; CHECK-NEXT:       - 2
+; CHECK-NEXT:       - 0
+; CHECK:          .name:           test_kernel
+; CHECK:          .symbol:         test_kernel.kd
+
+define amdgpu_kernel void @test_kernel(i8 %a) #0
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+
+attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"char"}
+!4 = !{!""}
+
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll
new file mode 100644
index 0000000..5f1cda0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK: ---
+; CHECK:  Version: [ 1, 0 ]
+; CHECK:  Kernels:
+
+; CHECK:      - Name:            test_kernel
+; CHECK-NEXT:   SymbolName:      'test_kernel@kd'
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Name:          a
+; CHECK-NEXT:       TypeName:      char
+; CHECK-NEXT:       Size:          1
+; CHECK-NEXT:       Align:         1
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NOT:        ValueKind:     HiddenHostcallBuffer
+; CHECK-NOT:        ValueKind:     HiddenDefaultQueue
+; CHECK-NOT:        ValueKind:     HiddenCompletionAction
+
+define amdgpu_kernel void @test_kernel(i8 %a) #0
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"char"}
+!4 = !{!""}
+
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3.ll
new file mode 100644
index 0000000..1a75f36
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK:              ---
+; CHECK:      amdhsa.kernels:
+; CHECK:        - .args:
+; CHECK-NEXT:       - .name:           a
+; CHECK-NEXT:         .offset:         0
+; CHECK-NEXT:         .size:           1
+; CHECK-NEXT:         .type_name:      char
+; CHECK-NEXT:         .value_kind:     by_value
+; CHECK-NEXT:         .value_type:     i8
+; CHECK-NEXT:       - .offset:         8
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:         .value_type:     i64
+; CHECK-NEXT:       - .offset:         16
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:         .value_type:     i64
+; CHECK-NEXT:       - .offset:         24
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:         .value_type:     i64
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .offset:         32
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     hidden_hostcall_buffer
+; CHECK-NEXT:         .value_type:     i8
+; CHECK:          .language:       OpenCL C
+; CHECK-NEXT:     .language_version:
+; CHECK-NEXT:       - 2
+; CHECK-NEXT:       - 0
+; CHECK:          .name:           test_kernel
+; CHECK:          .symbol:         test_kernel.kd
+
+declare <2 x i64> @__ockl_hostcall_internal(i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64)
+
+define amdgpu_kernel void @test_kernel(i8 %a) #0
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+
+attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"char"}
+!4 = !{!""}
+
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll
new file mode 100644
index 0000000..b042863
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll
@@ -0,0 +1,58 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK: ---
+; CHECK:  Version: [ 1, 0 ]
+; CHECK:  Kernels:
+
+; CHECK:      - Name:            test_kernel
+; CHECK-NEXT:   SymbolName:      'test_kernel@kd'
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Name:          a
+; CHECK-NEXT:       TypeName:      char
+; CHECK-NEXT:       Size:          1
+; CHECK-NEXT:       Align:         1
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:            8
+; CHECK-NEXT:       Align:           8
+; CHECK-NEXT:       ValueKind:       HiddenHostcallBuffer
+; CHECK-NEXT:       ValueType:       I8
+; CHECK-NEXT:       AddrSpaceQual:   Global
+; CHECK-NOT:        ValueKind:     HiddenDefaultQueue
+; CHECK-NOT:        ValueKind:     HiddenCompletionAction
+
+declare <2 x i64> @__ockl_hostcall_internal(i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64)
+
+define amdgpu_kernel void @test_kernel(i8 %a) #0
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"char"}
+!4 = !{!""}
+
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf-no-hostcall.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf-no-hostcall.ll
new file mode 100644
index 0000000..14c2976
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/opencl-printf-no-hostcall.ll
@@ -0,0 +1,18 @@
+; RUN: not opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-printf-runtime-binding < %s 2>&1 | FileCheck %s
+
+@.str = private unnamed_addr addrspace(2) constant [6 x i8] c"%s:%d\00", align 1
+
+define amdgpu_kernel void @test_kernel(i32 %n) {
+entry:
+  %str = alloca [9 x i8], align 1
+  %arraydecay = getelementptr inbounds [9 x i8], [9 x i8]* %str, i32 0, i32 0
+  %call1 = call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8* %arraydecay, i32 %n)
+  %call2 = call <2 x i64> (i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64) @__ockl_hostcall_internal(i8* undef, i32 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9)
+  ret void
+}
+
+declare i32 @printf(i8 addrspace(2)*, ...)
+
+declare <2 x i64> @__ockl_hostcall_internal(i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64)
+
+; CHECK: error: Cannot use both printf and hostcall in the same module