[CUDA][OpenMP] Create generic offload toolchains

Summary:
This patch introduces the concept of offloading tool chain and offloading kind. Each tool chain may have associated an offloading kind that marks it as used in a given programming model that requires offloading. 

It also adds the logic to iterate on the tool chains based on the kind. Currently, only CUDA is supported, but in general a programming model (an offloading kind) may have associated multiple tool chains that require supporting offloading.

This patch does not add tests - its goal is to keep the existing functionality.

This patch is the first of a series of three that attempts to make the current support of CUDA more generic and easier to extend to other programming models, namely OpenMP. It tries to capture the suggestions/improvements/concerns on the initial proposal in  http://lists.llvm.org/pipermail/cfe-dev/2016-February/047547.html. It only tackles the more consensual part of the proposal, i.e.does not address the problem of intermediate files bundling yet.

Reviewers: ABataev, jlebar, echristo, hfinkel, tra

Subscribers: guansong, Hahnfeld, andreybokhanko, tcramer, mkuron, cfe-commits, arpith-jacob, carlo.bertolli, caomhin

Differential Revision: http://reviews.llvm.org/D18170

llvm-svn: 272571
diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
index b24d381..46548e6 100644
--- a/clang/lib/Driver/Compilation.cpp
+++ b/clang/lib/Driver/Compilation.cpp
@@ -24,10 +24,13 @@
 
 Compilation::Compilation(const Driver &D, const ToolChain &_DefaultToolChain,
                          InputArgList *_Args, DerivedArgList *_TranslatedArgs)
-    : TheDriver(D), DefaultToolChain(_DefaultToolChain),
-      CudaHostToolChain(&DefaultToolChain), CudaDeviceToolChain(nullptr),
+    : TheDriver(D), DefaultToolChain(_DefaultToolChain), ActiveOffloadMask(0u),
       Args(_Args), TranslatedArgs(_TranslatedArgs), Redirects(nullptr),
-      ForDiagnostics(false) {}
+      ForDiagnostics(false) {
+  // The offloading host toolchain is the default tool chain.
+  OrderedOffloadingToolchains.insert(
+      std::make_pair(Action::OFK_Host, &DefaultToolChain));
+}
 
 Compilation::~Compilation() {
   delete TranslatedArgs;
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f7e1530..796fd64 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -422,6 +422,31 @@
   }
 }
 
+void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
+                                              InputList &Inputs) {
+
+  //
+  // CUDA
+  //
+  // We need to generate a CUDA toolchain if any of the inputs has a CUDA type.
+  if (llvm::any_of(Inputs, [](std::pair<types::ID, const llvm::opt::Arg *> &I) {
+        return types::isCuda(I.first);
+      })) {
+    const ToolChain &TC = getToolChain(
+        C.getInputArgs(),
+        llvm::Triple(C.getOffloadingHostToolChain()->getTriple().isArch64Bit()
+                         ? "nvptx64-nvidia-cuda"
+                         : "nvptx-nvidia-cuda"));
+    C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda);
+  }
+
+  //
+  // TODO: Add support for other offloading programming models here.
+  //
+
+  return;
+}
+
 Compilation *Driver::BuildCompilation(ArrayRef<const char *> ArgList) {
   llvm::PrettyStackTraceString CrashInfo("Compilation construction");
 
@@ -549,18 +574,8 @@
   InputList Inputs;
   BuildInputs(C->getDefaultToolChain(), *TranslatedArgs, Inputs);
 
-  // Initialize the CUDA device TC only if we have any CUDA Inputs.  This is
-  // necessary so that we don't break compilations that pass flags that are
-  // incompatible with the NVPTX TC (e.g. -mthread-model single).
-  if (llvm::any_of(Inputs, [](const std::pair<types::ID, const Arg *> &I) {
-        return I.first == types::TY_CUDA || I.first == types::TY_PP_CUDA ||
-               I.first == types::TY_CUDA_DEVICE;
-      })) {
-    C->setCudaDeviceToolChain(
-        &getToolChain(C->getArgs(), llvm::Triple(TC.getTriple().isArch64Bit()
-                                                     ? "nvptx64-nvidia-cuda"
-                                                     : "nvptx-nvidia-cuda")));
-  }
+  // Populate the tool chains for the offloading devices, if any.
+  CreateOffloadingDeviceToolChains(*C, Inputs);
 
   // Construct the list of abstract actions to perform for this compilation. On
   // MachO targets this uses the driver-driver and universal actions.
@@ -1390,7 +1405,7 @@
     CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
 
   // Build actions for all device inputs.
-  assert(C.getCudaDeviceToolChain() &&
+  assert(C.getSingleOffloadToolChain<Action::OFK_Cuda>() &&
          "Missing toolchain for device-side compilation.");
   ActionList CudaDeviceActions;
   C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions);
@@ -2031,7 +2046,7 @@
     // Initial processing of CudaDeviceAction carries host params.
     // Call BuildJobsForAction() again, now with correct device parameters.
     InputInfo II = BuildJobsForAction(
-        C, *CDA->input_begin(), C.getCudaDeviceToolChain(),
+        C, *CDA->input_begin(), C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
         CDA->getGpuArchName(), CDA->isAtTopLevel(), /*MultipleArchs=*/true,
         LinkingOutput, CachedResults);
     // Currently II's Action is *CDA->input_begin().  Set it to CDA instead, so
diff --git a/clang/lib/Driver/Tools.cpp b/clang/lib/Driver/Tools.cpp
index 9b715e3..e73c22d 100644
--- a/clang/lib/Driver/Tools.cpp
+++ b/clang/lib/Driver/Tools.cpp
@@ -3767,10 +3767,10 @@
     // particular compilation pass we're constructing here. For now we
     // can check which toolchain we're using and pick the other one to
     // extract the triple.
-    if (&getToolChain() == C.getCudaDeviceToolChain())
-      AuxToolChain = C.getCudaHostToolChain();
-    else if (&getToolChain() == C.getCudaHostToolChain())
-      AuxToolChain = C.getCudaDeviceToolChain();
+    if (&getToolChain() == C.getSingleOffloadToolChain<Action::OFK_Cuda>())
+      AuxToolChain = C.getOffloadingHostToolChain();
+    else if (&getToolChain() == C.getOffloadingHostToolChain())
+      AuxToolChain = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
     else
       llvm_unreachable("Can't figure out CUDA compilation mode.");
     assert(AuxToolChain != nullptr && "No aux toolchain.");