[CUDA] add support for the new kernel launch API in CUDA-9.2+.

Instead of calling CUDA runtime to arrange function arguments,
the new API constructs arguments in a local array and the kernels
are launched with __cudaLaunchKernel().

The old API has been deprecated and is expected to go away
in the next CUDA release.

Differential Revision: https://reviews.llvm.org/D57488

llvm-svn: 352799
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 0678f14..9aaa5f7 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -15,6 +15,8 @@
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "clang/AST/Decl.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CodeGen/CodeGenABITypes.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -102,7 +104,8 @@
     return DummyFunc;
   }
 
-  void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
+  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
+  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
 
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
@@ -187,11 +190,110 @@
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                      FunctionArgList &Args) {
   EmittedKernels.push_back(CGF.CurFn);
-  emitDeviceStubBody(CGF, Args);
+  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+    emitDeviceStubBodyNew(CGF, Args);
+  else
+    emitDeviceStubBodyLegacy(CGF, Args);
 }
 
-void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
-                                         FunctionArgList &Args) {
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+                                            FunctionArgList &Args) {
+  // Build the shadow stack entry at the very start of the function.
+
+  // Calculate amount of space we will need for all arguments.  If we have no
+  // args, allocate a single pointer so we still have a valid pointer to the
+  // argument array that we can pass to runtime, even if it will be unused.
+  Address KernelArgs = CGF.CreateTempAlloca(
+      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
+      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
+  // Store pointers to the arguments in a locally allocated launch_args.
+  for (unsigned i = 0; i < Args.size(); ++i) {
+    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
+    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
+    CGF.Builder.CreateDefaultAlignedStore(
+        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
+  }
+
+  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
+
+  // Lookup cudaLaunchKernel function.
+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+  //                              void **args, size_t sharedMem,
+  //                              cudaStream_t stream);
+  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
+  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+  IdentifierInfo &cudaLaunchKernelII =
+      CGM.getContext().Idents.get("cudaLaunchKernel");
+  FunctionDecl *cudaLaunchKernelFD = nullptr;
+  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
+    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
+      cudaLaunchKernelFD = FD;
+  }
+
+  if (cudaLaunchKernelFD == nullptr) {
+    CGM.Error(CGF.CurFuncDecl->getLocation(),
+              "Can't find declaration for cudaLaunchKernel()");
+    return;
+  }
+  // Create temporary dim3 grid_dim, block_dim.
+  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
+  QualType Dim3Ty = GridDimParam->getType();
+  Address GridDim =
+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
+  Address BlockDim =
+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
+  Address ShmemSize =
+      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
+  Address Stream =
+      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
+  llvm::Constant *cudaPopConfigFn = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy,
+                              {/*gridDim=*/GridDim.getType(),
+                               /*blockDim=*/BlockDim.getType(),
+                               /*ShmemSize=*/ShmemSize.getType(),
+                               /*Stream=*/Stream.getType()},
+                              /*isVarArg=*/false),
+      "__cudaPopCallConfiguration");
+
+  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
+                              {GridDim.getPointer(), BlockDim.getPointer(),
+                               ShmemSize.getPointer(), Stream.getPointer()});
+
+  // Emit the call to cudaLaunch
+  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
+  CallArgList LaunchKernelArgs;
+  LaunchKernelArgs.add(RValue::get(Kernel),
+                       cudaLaunchKernelFD->getParamDecl(0)->getType());
+  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
+  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
+  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
+                       cudaLaunchKernelFD->getParamDecl(3)->getType());
+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
+                       cudaLaunchKernelFD->getParamDecl(4)->getType());
+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
+                       cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  QualType QT = cudaLaunchKernelFD->getType();
+  QualType CQT = QT.getCanonicalType();
+  llvm::Type *Ty = CGM.getTypes().ConvertFunctionType(CQT, cudaLaunchKernelFD);
+  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
+
+  const CGFunctionInfo &FI =
+      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+  llvm::Constant *cudaLaunchKernelFn =
+      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
+  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
+               LaunchKernelArgs);
+  CGF.EmitBranch(EndBlock);
+
+  CGF.EmitBlock(EndBlock);
+}
+
+void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
+                                               FunctionArgList &Args) {
   // Emit a call to cudaSetupArgument for each arg in Args.
   llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
index f05c045..4b36724 100644
--- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -426,5 +426,15 @@
 #pragma pop_macro("__USE_FAST_MATH__")
 #pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
 
+// CUDA runtime uses this undocumented function to access kernel launch
+// configuration. The declaration is in crt/device_functions.h but that file
+// includes a lot of other stuff we don't want. Instead, we'll provide our own
+// declaration for it here.
+#if CUDA_VERSION >= 9020
+extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                                size_t sharedMem = 0,
+                                                void *stream = 0);
+#endif
+
 #endif // __CUDA__
 #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index ec926ea..43cc14d 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -13,6 +13,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Sema.h"
@@ -41,9 +42,8 @@
                                          SourceLocation GGGLoc) {
   FunctionDecl *ConfigDecl = Context.getcudaConfigureCallDecl();
   if (!ConfigDecl)
-    return ExprError(
-        Diag(LLLLoc, diag::err_undeclared_var_use)
-        << (getLangOpts().HIP ? "hipConfigureCall" : "cudaConfigureCall"));
+    return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+                     << getCudaConfigureFuncName());
   QualType ConfigQTy = ConfigDecl->getType();
 
   DeclRefExpr *ConfigDR = new (Context)
@@ -957,3 +957,16 @@
   copyAttrIfPresent<CUDAHostAttr>(*this, FD, TemplateFD);
   copyAttrIfPresent<CUDADeviceAttr>(*this, FD, TemplateFD);
 }
+
+std::string Sema::getCudaConfigureFuncName() const {
+  if (getLangOpts().HIP)
+    return "hipConfigureCall";
+
+  // New CUDA kernel launch sequence.
+  if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+    return "__cudaPushCallConfiguration";
+
+  // Legacy CUDA kernel configuration call
+  return "cudaConfigureCall";
+}
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 38a7330..112184d 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -9146,13 +9146,12 @@
 
   if (getLangOpts().CUDA) {
     IdentifierInfo *II = NewFD->getIdentifier();
-    if (II &&
-        II->isStr(getLangOpts().HIP ? "hipConfigureCall"
-                                    : "cudaConfigureCall") &&
+    if (II && II->isStr(getCudaConfigureFuncName()) &&
         !NewFD->isInvalidDecl() &&
         NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
       if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
-        Diag(NewFD->getLocation(), diag::err_config_scalar_return);
+        Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+            << getCudaConfigureFuncName();
       Context.setcudaConfigureCallDecl(NewFD);
     }