[CUDA] Include single GPU binary, NFCI.

Binaries for multiple architectures are combined by fatbinary,
so the current code was effectively not needed.

Differential Revision: https://reviews.llvm.org/D43461

llvm-svn: 326342
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index d24ef0a..4272aef 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -41,10 +41,10 @@
   /// Keeps track of kernel launch stubs emitted in this module
   llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
   llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
-  /// Keeps track of variables containing handles of GPU binaries. Populated by
+  /// Keeps track of variable containing handle of GPU binary. Populated by
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
-  llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
+  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
@@ -245,16 +245,14 @@
 /// Creates a global constructor function for the module:
 /// \code
 /// void __cuda_module_ctor(void*) {
-///     Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
-///     __cuda_register_globals(Handle0);
-///     ...
-///     HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
-///     __cuda_register_globals(HandleN);
+///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
+///     __cuda_register_globals(Handle);
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
-  // No need to generate ctors/dtors if there are no GPU binaries.
-  if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
+  // No need to generate ctors/dtors if there is no GPU binary.
+  std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
+  if (GpuBinaryFileName.empty())
     return nullptr;
 
   // void __cuda_register_globals(void* handle);
@@ -267,6 +265,18 @@
   llvm::StructType *FatbinWrapperTy =
       llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
 
+  // Register GPU binary with the CUDA runtime, store returned handle in a
+  // global variable and save a reference in GpuBinaryHandle to be cleaned up
+  // in destructor on exit. Then associate all known kernels with the GPU binary
+  // handle so CUDA runtime can figure out what to call on the GPU side.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
+      llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
+  if (std::error_code EC = GpuBinaryOrErr.getError()) {
+    CGM.getDiags().Report(diag::err_cannot_open_file)
+        << GpuBinaryFileName << EC.message();
+    return nullptr;
+  }
+
   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
       llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
@@ -276,79 +286,56 @@
 
   CtorBuilder.SetInsertPoint(CtorEntryBB);
 
-  // For each GPU binary, register it with the CUDA runtime and store returned
-  // handle in a global variable and save the handle in GpuBinaryHandles vector
-  // to be cleaned up in destructor on exit. Then associate all known kernels
-  // with the GPU binary handle so CUDA runtime can figure out what to call on
-  // the GPU side.
-  for (const std::string &GpuBinaryFileName :
-       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
-    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
-        llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
-    if (std::error_code EC = GpuBinaryOrErr.getError()) {
-      CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
-                                                        << EC.message();
-      continue;
-    }
+  const char *FatbinConstantName =
+      CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  // NVIDIA's cuobjdump looks for fatbins in this section.
+  const char *FatbinSectionName =
+      CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
 
-    const char *FatbinConstantName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
-    // NVIDIA's cuobjdump looks for fatbins in this section.
-    const char *FatbinSectionName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+  // Create initialized wrapper structure that points to the loaded GPU binary
+  ConstantInitBuilder Builder(CGM);
+  auto Values = Builder.beginStruct(FatbinWrapperTy);
+  // Fatbin wrapper magic.
+  Values.addInt(IntTy, 0x466243b1);
+  // Fatbin version.
+  Values.addInt(IntTy, 1);
+  // Data.
+  Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
+                                FatbinConstantName, 8));
+  // Unused in fatbin v1.
+  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
+  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
+      "__cuda_fatbin_wrapper", CGM.getPointerAlign(),
+      /*constant*/ true);
+  FatbinWrapper->setSection(FatbinSectionName);
 
-    // Create initialized wrapper structure that points to the loaded GPU binary
-    ConstantInitBuilder Builder(CGM);
-    auto Values = Builder.beginStruct(FatbinWrapperTy);
-    // Fatbin wrapper magic.
-    Values.addInt(IntTy, 0x466243b1);
-    // Fatbin version.
-    Values.addInt(IntTy, 1);
-    // Data.
-    Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), 
-                                  "", FatbinConstantName, 8));
-    // Unused in fatbin v1.
-    Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
-    llvm::GlobalVariable *FatbinWrapper =
-      Values.finishAndCreateGlobal("__cuda_fatbin_wrapper",
-                                   CGM.getPointerAlign(),
-                                   /*constant*/ true);
-    FatbinWrapper->setSection(FatbinSectionName);
+  // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+  llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+      RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+  GpuBinaryHandle = new llvm::GlobalVariable(
+      TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+  CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
+                                 CGM.getPointerAlign());
 
-    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
-    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
-        RegisterFatbinFunc,
-        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
-    llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
-        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
-    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
-                                   CGM.getPointerAlign());
-
-    // Call __cuda_register_globals(GpuBinaryHandle);
-    if (RegisterGlobalsFunc)
-      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
-
-    // Save GpuBinaryHandle so we can unregister it in destructor.
-    GpuBinaryHandles.push_back(GpuBinaryHandle);
-  }
+  // Call __cuda_register_globals(GpuBinaryHandle);
+  if (RegisterGlobalsFunc)
+    CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
 
   CtorBuilder.CreateRetVoid();
   return ModuleCtorFunc;
 }
 
-/// Creates a global destructor function that unregisters all GPU code blobs
+/// Creates a global destructor function that unregisters the GPU code blob
 /// registered by constructor.
 /// \code
 /// void __cuda_module_dtor(void*) {
-///     __cudaUnregisterFatBinary(Handle0);
-///     ...
-///     __cudaUnregisterFatBinary(HandleN);
+///     __cudaUnregisterFatBinary(Handle);
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
-  // No need for destructor if we don't have handles to unregister.
-  if (GpuBinaryHandles.empty())
+  // No need for destructor if we don't have a handle to unregister.
+  if (!GpuBinaryHandle)
     return nullptr;
 
   // void __cudaUnregisterFatBinary(void ** handle);
@@ -364,11 +351,9 @@
   CGBuilderTy DtorBuilder(CGM, Context);
   DtorBuilder.SetInsertPoint(DtorEntryBB);
 
-  for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
-    auto HandleValue =
+  auto HandleValue =
       DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
-    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
-  }
+  DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
 
   DtorBuilder.CreateRetVoid();
   return ModuleDtorFunc;