[OpenMP] Add flag for specifying the target device architecture for OpenMP device offloading

Summary:
OpenMP has the ability to offload target regions to devices which may have different architectures.

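The device architecture is specified with -march, which can be passed to the device toolchain through the -Xopenmp-target flag (for example: -Xopenmp-target -march=sm_35).

In this patch the option is used to specify the compute capability of the underlying NVIDIA architecture for the OpenMP offloading CUDA tool chain; when no architecture is given, the CUDA tool chain defaults to sm_20.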

Only a host-offloading test is provided since full device offloading capability will only be available when [[ https://reviews.llvm.org/D29654 | D29654 ]] lands.

Reviewers: hfinkel, Hahnfeld, carlo.bertolli, caomhin, ABataev

Reviewed By: hfinkel

Subscribers: guansong, cfe-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D34784

llvm-svn: 310263
diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
index cf86644..6626662 100644
--- a/clang/lib/Driver/Compilation.cpp
+++ b/clang/lib/Driver/Compilation.cpp
@@ -59,7 +59,12 @@
 
   DerivedArgList *&Entry = TCArgs[{TC, BoundArch, DeviceOffloadKind}];
   if (!Entry) {
-    Entry = TC->TranslateArgs(*TranslatedArgs, BoundArch, DeviceOffloadKind);
+    // Translate OpenMP toolchain arguments provided via the -Xopenmp-target flags.
+    Entry = TC->TranslateOpenMPTargetArgs(*TranslatedArgs, DeviceOffloadKind);
+    if (!Entry)
+      Entry = TranslatedArgs;
+
+    Entry = TC->TranslateArgs(*Entry, BoundArch, DeviceOffloadKind);
     if (!Entry)
       Entry = TranslatedArgs;
   }
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 1e446dc..94d5e4f 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -795,3 +795,65 @@
 
   return VersionTuple();
 }
+
+// Translates Args for an OpenMP device offloading toolchain.
+// Returns nullptr unless DeviceOffloadKind is OFK_OpenMP; otherwise returns a
+// DerivedArgList created with new and handed back as a raw pointer. Options
+// wrapped in -Xopenmp-target are re-parsed and appended in place of the
+// wrapper, -m options are kept only when the host (aux) triple equals this
+// toolchain's triple, and every other argument is appended unchanged.
+llvm::opt::DerivedArgList *
+ToolChain::TranslateOpenMPTargetArgs(const llvm::opt::DerivedArgList &Args,
+    Action::OffloadKind DeviceOffloadKind) const {
+  if (DeviceOffloadKind != Action::OFK_OpenMP)
+    return nullptr;
+
+  const OptTable &Opts = getDriver().getOpts();
+  DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs());
+
+  for (Arg *A : Args) {
+    const auto &O = A->getOption();
+    // -m options may be meant for the host only, so keep them just when the
+    // host triple (AuxTriple) equals the triple of this toolchain.
+    if (O.matches(options::OPT_m_Group)) {
+      const auto *HostTriple = getAuxTriple();
+      if (HostTriple && HostTriple->str() == getTriple().str())
+        DAL->append(A);
+      continue;
+    }
+
+    const bool WithTriple = O.matches(options::OPT_Xopenmp_target_EQ);
+    const bool WithoutTriple = O.matches(options::OPT_Xopenmp_target);
+    // Ordinary arguments are forwarded unchanged.
+    if (!WithTriple && !WithoutTriple) {
+      DAL->append(A);
+      continue;
+    }
+    // -Xopenmp-target=<triple> -opt=val: ignore it if <triple> is not ours.
+    if (WithTriple && A->getValue(0) != getTripleString())
+      continue;
+
+    // The wrapped option is value 1 of the <triple> form and value 0 of the
+    // plain "-Xopenmp-target -opt=val" form; parse it as its own argument.
+    unsigned Index =
+        Args.getBaseArgs().MakeIndex(A->getValue(WithTriple ? 1 : 0));
+    const unsigned Prev = Index;
+    std::unique_ptr<Arg> Wrapped(Opts.ParseOneArg(Args, Index));
+    // NOTE(review): Index > Prev + 1 presumably means that more than one
+    // argument string was consumed -- confirm against ParseOneArg.
+    if (!Wrapped || Index > Prev + 1) {
+      getDriver().Diag(diag::err_drv_invalid_Xopenmp_target_with_args)
+          << A->getAsString(Args);
+      continue;
+    }
+    // The triple-less form needs exactly one -fopenmp-targets= value.
+    if (WithoutTriple &&
+        Args.getAllArgValues(options::OPT_fopenmp_targets_EQ).size() != 1) {
+      getDriver().Diag(diag::err_drv_Xopenmp_target_missing_triple);
+      continue;
+    }
+    Wrapped->setBaseArg(A);
+    DAL->append(Wrapped.release());
+  }
+  return DAL;
+}
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 935a5a3..86be187 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -212,8 +212,18 @@
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
+  StringRef GPUArchName;
+  // If this is an OpenMP action we need to extract the device architecture
+  // from the -march=arch option. This option may come from -Xopenmp-target
+  // flag or the default value.
+  if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
+    GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
+    assert(!GPUArchName.empty() && "Must have an architecture passed in.");
+  } else
+    GPUArchName = JA.getOffloadingArch();
+
   // Obtain architecture from the action.
-  CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
+  CudaArch gpu_arch = StringToCudaArch(GPUArchName);
   assert(gpu_arch != CudaArch::UNKNOWN &&
          "Device action expected to have an architecture.");
 
@@ -405,7 +415,7 @@
 
   // For OpenMP device offloading, append derived arguments. Make sure
   // flags are not duplicated.
-  // TODO: Append the compute capability.
+  // Also append the compute capability.
   if (DeviceOffloadKind == Action::OFK_OpenMP) {
     for (Arg *A : Args){
       bool IsDuplicate = false;
@@ -418,6 +428,13 @@
       if (!IsDuplicate)
         DAL->append(A);
     }
+
+    StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ);
+    if (Arch.empty())
+      // Default compute capability for CUDA toolchain is sm_20.
+      DAL->AddJoinedArg(nullptr,
+          Opts.getOption(options::OPT_march_EQ), "sm_20");
+
     return DAL;
   }