[CUDA] Invoke ptxas and fatbinary during compilation.

Summary:
Previously we compiled CUDA device code to PTX assembly and embedded
that asm as text in our host binary.  Now we compile to PTX assembly and
then invoke ptxas to assemble the PTX into a cubin file.  We gather the
ptx and cubin files for each of our --cuda-gpu-archs and combine them
using fatbinary, and then embed that into the host binary.

Adds two new command-line flags, -Xcuda_ptxas and -Xcuda_fatbinary,
which pass args down to the external tools.

Reviewers: tra, echristo

Subscribers: cfe-commits, jhen

Differential Revision: http://reviews.llvm.org/D16082

llvm-svn: 257809
diff --git a/clang/lib/Driver/Tools.cpp b/clang/lib/Driver/Tools.cpp
index 5e4777b..e498f98 100644
--- a/clang/lib/Driver/Tools.cpp
+++ b/clang/lib/Driver/Tools.cpp
@@ -10625,3 +10625,81 @@
   else
     ConstructGoldLinkJob(*this, C, JA, Output, Inputs, Args, LinkingOutput);
 }
+
+void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  std::vector<std::string> gpu_archs =
+      Args.getAllArgValues(options::OPT_march_EQ);
+  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
+  const std::string& gpu_arch = gpu_archs[0];
+
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
+
+  // Clang's default optimization level is -O0, but ptxas's default is -O3.
+  CmdArgs.push_back(Args.MakeArgString(
+      llvm::Twine("-O") +
+      Args.getLastArgValue(options::OPT_O_Group, "0").data()));
+
+  // Don't bother passing -g to ptxas: It's enabled by default at -O0, and
+  // not supported at other optimization levels.
+
+  CmdArgs.push_back("--gpu-name");
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back("--output-file");
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+  for (const auto& II : Inputs)
+    CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
+
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
+
+// All inputs to this linker must be from CudaDeviceActions, as we need to look
+// at the Inputs' Actions in order to figure out which GPU architecture they
+// correspond to.
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back("--cuda");
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
+  CmdArgs.push_back(Args.MakeArgString("--create"));
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+
+  for (const auto& II : Inputs) {
+    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    // We need to pass an Arch of the form "sm_XX" for cubin files and
+    // "compute_XX" for ptx.
+    const char *Arch = (II.getType() == types::TY_PP_Asm)
+                           ? A->getComputeArchName()
+                           : A->getGpuArchName();
+    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
+                                         Arch + ",file=" + II.getFilename()));
+  }
+
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}