[HIP] Support early finalization of device code for -fno-gpu-rdc This patch renames -f{no-}cuda-rdc to -f{no-}gpu-rdc and keeps the original options as aliases. When -fgpu-rdc is off, clang assumes that the device code in each translation unit does not call external functions except those in the device library; therefore it is possible to compile the device code in each translation unit to self-contained kernels and embed them in the host object, so that the host object behaves like a usual host object which can be linked by lld. The benefits of this feature are: 1. it allows users to create static libraries which can be linked by the host linker; 2. it amortizes device code linking time. This patch modifies the HIP action builder to insert actions for linking device code and generating the HIP fatbin, and to pass the HIP fatbin to the host backend action. It extracts the code for constructing the command that generates the HIP fatbin into a function so that it can be reused by early finalization. It also modifies codegen of HIP host constructor functions to embed the device fatbin when it is available. Differential Revision: https://reviews.llvm.org/D52377 llvm-svn: 343611

commit: 9767089d003b52ec9d32b93c8533c815c9906902 [log] [tgz]
author: Yaxun Liu <Yaxun.Liu@amd.com> Tue Oct 02 17:48:54 2018 +0000
committer: Yaxun Liu <Yaxun.Liu@amd.com> Tue Oct 02 17:48:54 2018 +0000
tree: e9788366c4929488c5cc4c31054fa4ebc749b0f9
parent: 2b5259afb3ce81334fab0f4d7a741b85f465caf7 [diff]
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 3ca31ab..070abd8 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp

@@ -4920,16 +4920,16 @@
     CmdArgs.push_back(Args.MakeArgString(Flags));
   }
 
-  if (IsCuda) {
-    // Host-side cuda compilation receives all device-side outputs in a single
-    // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
-    if (CudaDeviceInput) {
+  // Host-side cuda compilation receives all device-side outputs in a single
+  // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
+  if ((IsCuda || IsHIP) && CudaDeviceInput) {
       CmdArgs.push_back("-fcuda-include-gpubinary");
       CmdArgs.push_back(CudaDeviceInput->getFilename());
-    }
+      if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+        CmdArgs.push_back("-fgpu-rdc");
+  }
 
-    if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
-      CmdArgs.push_back("-fcuda-rdc");
+  if (IsCuda) {
     if (Args.hasFlag(options::OPT_fcuda_short_ptr,
                      options::OPT_fno_cuda_short_ptr, false))
       CmdArgs.push_back("-fcuda-short-ptr");

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 9e6aeea..355e8d4 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp

@@ -15,6 +15,7 @@
 #include "Arch/SystemZ.h"
 #include "Arch/X86.h"
 #include "Hexagon.h"
+#include "HIP.h"
 #include "InputInfo.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/LangOptions.h"
@@ -1337,6 +1338,18 @@
   if (!JA.isHostOffloading(Action::OFK_HIP))
     return;
 
+  InputInfoList DeviceInputs;
+  for (const auto &II : Inputs) {
+    const Action *A = II.getAction();
+    // Is this a device linking action?
+    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+      DeviceInputs.push_back(II);
+    }
+  }
+
+  if (DeviceInputs.empty())
+    return;
+
   // Create temporary linker script. Keep it if save-temps is enabled.
   const char *LKS;
   SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
@@ -1364,39 +1377,12 @@
          "Wrong platform");
   (void)HIPTC;
 
-  // Construct clang-offload-bundler command to bundle object files for
-  // for different GPU archs.
-  ArgStringList BundlerArgs;
-  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
-
-  // ToDo: Remove the dummy host binary entry which is required by
-  // clang-offload-bundler.
-  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
-  std::string BundlerInputArg = "-inputs=/dev/null";
-
-  for (const auto &II : Inputs) {
-    const Action *A = II.getAction();
-    // Is this a device linking action?
-    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
-      BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
-                         StringRef(A->getOffloadingArch()).str();
-      BundlerInputArg = BundlerInputArg + "," + II.getFilename();
-    }
-  }
-  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
-  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
-
-  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+  // The output file name needs to persist through the compilation, therefore
+  // it needs to be created through MakeArgString.
+  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "hipfb");
   const char *BundleFile =
       C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
-  auto BundlerOutputArg =
-      Args.MakeArgString(std::string("-outputs=").append(BundleFile));
-  BundlerArgs.push_back(BundlerOutputArg);
-
-  SmallString<128> BundlerPath(C.getDriver().Dir);
-  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
-  const char *Bundler = Args.MakeArgString(BundlerPath);
-  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+  AMDGCN::constructHIPFatbinCommand(C, JA, BundleFile, DeviceInputs, Args, T);
 
   // Add commands to embed target binaries. We ensure that each section and
   // image is 16-byte aligned. This is not mandatory, but increases the

diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 08e49fa..ddcb008 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp

@@ -398,8 +398,8 @@
                                options::OPT_fnoopenmp_relocatable_target,
                                /*Default=*/true);
   else if (JA.isOffloading(Action::OFK_Cuda))
-    Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
-                               options::OPT_fno_cuda_rdc, /*Default=*/false);
+    Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+                               options::OPT_fno_gpu_rdc, /*Default=*/false);
 
   if (Relocatable)
     CmdArgs.push_back("-c");
@@ -609,9 +609,9 @@
                            options::OPT_fno_cuda_approx_transcendentals, false))
       CC1Args.push_back("-fcuda-approx-transcendentals");
 
-    if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+    if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                            false))
-      CC1Args.push_back("-fcuda-rdc");
+      CC1Args.push_back("-fgpu-rdc");
   }
 
   if (DriverArgs.hasArg(options::OPT_nocudalib))

diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp
index 6efcfae..58e8e79 100644
--- a/clang/lib/Driver/ToolChains/HIP.cpp
+++ b/clang/lib/Driver/ToolChains/HIP.cpp

@@ -184,6 +184,40 @@
   C.addCommand(llvm::make_unique<Command>(JA, *this, Lld, LldArgs, Inputs));
 }
 
+// Construct a clang-offload-bundler command to bundle code objects for
+// different GPUs into a HIP fat binary.
+void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+                  StringRef OutputFileName, const InputInfoList &Inputs,
+                  const llvm::opt::ArgList &Args, const Tool& T) {
+  // Construct clang-offload-bundler command to bundle object files for
+  // different GPU archs.
+  ArgStringList BundlerArgs;
+  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+  // ToDo: Remove the dummy host binary entry which is required by
+  // clang-offload-bundler.
+  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+  std::string BundlerInputArg = "-inputs=/dev/null";
+
+  for (const auto &II : Inputs) {
+    const auto* A = II.getAction();
+    BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+                       StringRef(A->getOffloadingArch()).str();
+    BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+  }
+  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+  auto BundlerOutputArg =
+      Args.MakeArgString(std::string("-outputs=").append(OutputFileName));
+  BundlerArgs.push_back(BundlerOutputArg);
+
+  SmallString<128> BundlerPath(C.getDriver().Dir);
+  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+  const char *Bundler = Args.MakeArgString(BundlerPath);
+  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+}
+
 // For amdgcn the inputs of the linker job are device bitcode and output is
 // object file. It calls llvm-link, opt, llc, then lld steps.
 void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
@@ -192,6 +226,9 @@
                                    const ArgList &Args,
                                    const char *LinkingOutput) const {
 
+  if (JA.getType() == types::TY_HIP_FATBIN)
+    return constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this);
+
   assert(getToolChain().getTriple().getArch() == llvm::Triple::amdgcn &&
          "Unsupported target");
 
@@ -244,9 +281,9 @@
                          options::OPT_fno_cuda_approx_transcendentals, false))
     CC1Args.push_back("-fcuda-approx-transcendentals");
 
-  if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+  if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                          false))
-    CC1Args.push_back("-fcuda-rdc");
+    CC1Args.push_back("-fgpu-rdc");
 
   // Default to "hidden" visibility, as object level linking will not be
   // supported for the foreseeable future.

diff --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h
index 40c9128..3af19d4 100644
--- a/clang/lib/Driver/ToolChains/HIP.h
+++ b/clang/lib/Driver/ToolChains/HIP.h

@@ -19,6 +19,11 @@
 namespace tools {
 
 namespace AMDGCN {
+  // Construct command for creating HIP fatbin.
+  void constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+                  StringRef OutputFileName, const InputInfoList &Inputs,
+                  const llvm::opt::ArgList &TCArgs, const Tool& T);
+
 // Runs llvm-link/opt/llc/lld, which links multiple LLVM bitcode, together with
 // device library, then compiles it to ISA in a shared object.
 class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
commit	9767089d003b52ec9d32b93c8533c815c9906902	[log] [tgz]
author	Yaxun Liu <Yaxun.Liu@amd.com>	Tue Oct 02 17:48:54 2018 +0000
committer	Yaxun Liu <Yaxun.Liu@amd.com>	Tue Oct 02 17:48:54 2018 +0000
tree	e9788366c4929488c5cc4c31054fa4ebc749b0f9
parent	2b5259afb3ce81334fab0f4d7a741b85f465caf7 [diff]