[CUDA][HIP] Use device side kernel and variable names when registering them
__hipRegisterFunction and __hipRegisterVar need to accept device side kernel and variable names
so that HIP runtime can associate kernel stub functions in host code with kernel symbols in fat binaries,
and associate shadow variables in host code with device variables in fat binaries.
Currently, clang assumes kernel functions and device variables have the same name as the kernel
stub functions and shadow variables. However, when host is compiled in windows with MSVC C++
ABI and device is compiled with Itanium C++ ABI (e.g. AMDGPU), kernels and device symbols in fat
binary are mangled differently than host.
This patch gets the device side kernel and variable name by mangling them in the mangle context
of aux target.
Differential Revision: https://reviews.llvm.org/D58163
llvm-svn: 354004
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 68e83b9..6266103 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -42,14 +42,25 @@
/// Convenience reference to the current module
llvm::Module &TheModule;
/// Keeps track of kernel launch stubs emitted in this module
- llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
- llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
+ struct KernelInfo {
+ llvm::Function *Kernel;
+ const Decl *D;
+ };
+ llvm::SmallVector<KernelInfo, 16> EmittedKernels;
+ struct VarInfo {
+ llvm::GlobalVariable *Var;
+ const VarDecl *D;
+ unsigned Flag;
+ };
+ llvm::SmallVector<VarInfo, 16> DeviceVars;
/// Keeps track of variable containing handle of GPU binary. Populated by
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
/// ModuleDtorFunction()
llvm::GlobalVariable *GpuBinaryHandle = nullptr;
/// Whether we generate relocatable device code.
bool RelocatableDeviceCode;
+ /// Mangle context for device.
+ std::unique_ptr<MangleContext> DeviceMC;
llvm::FunctionCallee getSetupArgumentFn() const;
llvm::FunctionCallee getLaunchFn() const;
@@ -106,13 +117,15 @@
void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
+ std::string getDeviceSideName(const Decl *ND);
public:
CGNVCUDARuntime(CodeGenModule &CGM);
void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
- void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
- DeviceVars.push_back(std::make_pair(&Var, Flags));
+ void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
+ unsigned Flags) override {
+ DeviceVars.push_back({&Var, VD, Flags});
}
/// Creates module constructor function
@@ -138,7 +151,9 @@
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
: CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
TheModule(CGM.getModule()),
- RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode) {
+ RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
+ DeviceMC(CGM.getContext().createMangleContext(
+ CGM.getContext().getAuxTargetInfo())) {
CodeGen::CodeGenTypes &Types = CGM.getTypes();
ASTContext &Ctx = CGM.getContext();
@@ -187,9 +202,26 @@
return llvm::FunctionType::get(VoidTy, Params, false);
}
+std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) {
+ auto *ND = cast<const NamedDecl>(D);
+ std::string DeviceSideName;
+ if (DeviceMC->shouldMangleDeclName(ND)) {
+ SmallString<256> Buffer;
+ llvm::raw_svector_ostream Out(Buffer);
+ DeviceMC->mangleName(ND, Out);
+ DeviceSideName = Out.str();
+ } else
+ DeviceSideName = ND->getIdentifier()->getName();
+ return DeviceSideName;
+}
+
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
FunctionArgList &Args) {
- EmittedKernels.push_back(CGF.CurFn);
+ assert(getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() ||
+ CGF.CGM.getContext().getTargetInfo().getCXXABI() !=
+ CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI());
+
+ EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH))
emitDeviceStubBodyNew(CGF, Args);
@@ -367,13 +399,19 @@
// __cuda_register_globals() and generate __cudaRegisterFunction() call for
// each emitted kernel.
llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
- for (llvm::Function *Kernel : EmittedKernels) {
- llvm::Constant *KernelName = makeConstantString(Kernel->getName());
+ for (auto &&I : EmittedKernels) {
+ llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D));
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
llvm::Value *Args[] = {
- &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
- KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
- NullPtr, NullPtr, NullPtr,
+ &GpuBinaryHandlePtr,
+ Builder.CreateBitCast(I.Kernel, VoidPtrTy),
+ KernelName,
+ KernelName,
+ llvm::ConstantInt::get(IntTy, -1),
+ NullPtr,
+ NullPtr,
+ NullPtr,
+ NullPtr,
llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
Builder.CreateCall(RegisterFunc, Args);
}
@@ -386,10 +424,10 @@
llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(IntTy, RegisterVarParams, false),
addUnderscoredPrefixToName("RegisterVar"));
- for (auto &Pair : DeviceVars) {
- llvm::GlobalVariable *Var = Pair.first;
- unsigned Flags = Pair.second;
- llvm::Constant *VarName = makeConstantString(Var->getName());
+ for (auto &&Info : DeviceVars) {
+ llvm::GlobalVariable *Var = Info.Var;
+ unsigned Flags = Info.Flag;
+ llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
uint64_t VarSize =
CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
llvm::Value *Args[] = {
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index e0e096b..ada6734 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -23,6 +23,7 @@
namespace clang {
class CUDAKernelCallExpr;
+class VarDecl;
namespace CodeGen {
@@ -52,7 +53,8 @@
/// Emits a kernel launch stub.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
- virtual void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) = 0;
+ virtual void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
+ unsigned Flags) = 0;
/// Constructs and returns a module initialization function or nullptr if it's
/// not needed. Must be called after all kernels have been emitted.
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index ece26de..972d2af 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -3635,7 +3635,7 @@
// Extern global variables will be registered in the TU where they are
// defined.
if (!D->hasExternalStorage())
- getCUDARuntime().registerDeviceVar(*GV, Flags);
+ getCUDARuntime().registerDeviceVar(D, *GV, Flags);
} else if (D->hasAttr<CUDASharedAttr>())
// __shared__ variables are odd. Shadows do get created, but
// they are not registered with the CUDA runtime, so they