[cuda] Driver changes to compile and stitch together host and device-side CUDA code.
NOTE: reverts r242077 to reinstate r242058, r242065 and r242067,
and includes a fix for OS X test failures.
- Changed driver pipeline to compile host and device side of CUDA
files and incorporate results of device-side compilation into host
object file.
- Added a test for cuda pipeline creation in clang driver.
New clang options:
--cuda-host-only - Do host-side compilation only.
--cuda-device-only - Do device-side compilation only.
--cuda-gpu-arch=<ARCH> - Specify GPU architecture for device-side
compilation, e.g. sm_35 or sm_30. Default is sm_20. May be used more
than once, in which case one device-side compilation will be done per
unique specified GPU architecture.
Differential Revision: http://reviews.llvm.org/D9509
llvm-svn: 242085
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index 360dbee..3219dc1 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -24,6 +24,8 @@
switch (AC) {
case InputClass: return "input";
case BindArchClass: return "bind-arch";
+ case CudaDeviceClass: return "cuda-device";
+ case CudaHostClass: return "cuda-host";
case PreprocessJobClass: return "preprocessor";
case PrecompileJobClass: return "precompiler";
case AnalyzeJobClass: return "analyzer";
@@ -53,6 +55,25 @@
const char *_ArchName)
: Action(BindArchClass, std::move(Input)), ArchName(_ArchName) {}
+void CudaDeviceAction::anchor() {} // Empty out-of-line definition (presumably the vtable anchor; confirm in Action.h).
+
+CudaDeviceAction::CudaDeviceAction(std::unique_ptr<Action> Input,
+ const char *ArchName, bool AtTopLevel)
+ : Action(CudaDeviceClass, std::move(Input)), GpuArchName(ArchName), // Input is moved into the Action base; the arch name is stored as passed.
+ AtTopLevel(AtTopLevel) {} // Parameter and member share a name; the initializer argument is the parameter.
+
+void CudaHostAction::anchor() {}
+
+/// Creates a host action that wraps \p Input (moved into the Action base)
+/// together with a copy of the \p DeviceActions pointer list. The pointed-to
+/// device actions are deleted by ~CudaHostAction().
+CudaHostAction::CudaHostAction(std::unique_ptr<Action> Input,
+                               const ActionList &DeviceActions)
+    // The parameter shares the member's name: inside the initializer's
+    // parentheses the name refers to the parameter. A leading underscore
+    // followed by an uppercase letter would be a reserved identifier.
+    : Action(CudaHostClass, std::move(Input)), DeviceActions(DeviceActions) {}
+
+/// Deletes every device action held in DeviceActions.
+CudaHostAction::~CudaHostAction() {
+  for (Action *DeviceAction : DeviceActions)
+    delete DeviceAction;
+}
+
void JobAction::anchor() {}
JobAction::JobAction(ActionClass Kind, std::unique_ptr<Action> Input,
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f3ec151..180c412 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -174,8 +174,10 @@
} else if ((PhaseArg = DAL.getLastArg(options::OPT_S))) {
FinalPhase = phases::Backend;
- // -c only runs up to the assembler.
- } else if ((PhaseArg = DAL.getLastArg(options::OPT_c))) {
+ // -c and partial CUDA compilations only run up to the assembler.
+ } else if ((PhaseArg = DAL.getLastArg(options::OPT_c)) ||
+ (PhaseArg = DAL.getLastArg(options::OPT_cuda_device_only)) ||
+ (PhaseArg = DAL.getLastArg(options::OPT_cuda_host_only))) {
FinalPhase = phases::Assemble;
// Otherwise do everything.
@@ -900,9 +902,20 @@
} else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
os << '"' << BIA->getArchName() << '"' << ", {"
<< PrintActions1(C, *BIA->begin(), Ids) << "}";
+ } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+ os << '"' << CDA->getGpuArchName() << '"' << ", {"
+ << PrintActions1(C, *CDA->begin(), Ids) << "}";
} else {
+ ActionList *AL;
+ if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+ os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}"
+ << ", gpu binaries ";
+ AL = &CHA->getDeviceActions();
+ } else
+ AL = &A->getInputs();
+
const char *Prefix = "{";
- for (Action *PreRequisite : *A) {
+ for (Action *PreRequisite : *AL) {
os << Prefix << PrintActions1(C, PreRequisite, Ids);
Prefix = ", ";
}
@@ -1215,6 +1228,93 @@
}
}
+// Builds the device-side actions for a single CUDA input.
+//
+// For each unique --cuda-gpu-arch= argument (sm_20 if none is given) creates a
+// TY_CUDA_DEVICE input action and then wraps each in a CudaDeviceAction paired
+// with the appropriate GPU arch name. If we're only building device-side code,
+// or device-side compilation stops before the backend, each device action is
+// appended to Actions and remains independent. Otherwise the device-side
+// actions become inputs of a new CudaHostAction which combines both host and
+// device side actions.
+//
+// Returns the action that replaces Current in the host pipeline: the new
+// CudaHostAction, Current itself (partial compilation), or nullptr
+// (device-only compilation, or an ambiguous -o, which is diagnosed here).
+static std::unique_ptr<Action>
+buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
+                 const Arg *InputArg, const types::ID InputType,
+                 std::unique_ptr<Action> Current, ActionList &Actions) {
+
+  assert(InputType == types::TY_CUDA &&
+         "CUDA Actions only apply to CUDA inputs.");
+
+  // Collect all cuda_gpu_arch parameters, removing duplicates while keeping
+  // the order in which they first appear.
+  SmallVector<const char *, 4> GpuArchList;
+  llvm::StringSet<> GpuArchNames;
+  for (Arg *A : Args) {
+    if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ)) {
+      A->claim();
+      if (GpuArchNames.insert(A->getValue()).second)
+        GpuArchList.push_back(A->getValue());
+    }
+  }
+
+  // Default to sm_20 which is the lowest common denominator for supported GPUs.
+  // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
+  if (GpuArchList.empty())
+    GpuArchList.push_back("sm_20");
+
+  // Replicate inputs for each GPU architecture.
+  Driver::InputList CudaDeviceInputs;
+  for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+    CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
+
+  // Build actions for all device inputs.
+  ActionList CudaDeviceActions;
+  D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions);
+  assert(GpuArchList.size() == CudaDeviceActions.size() &&
+         "Failed to create actions for all devices");
+
+  // Check whether any of device actions stopped before they could generate PTX.
+  bool PartialCompilation = false;
+  bool DeviceOnlyCompilation = Args.hasArg(options::OPT_cuda_device_only);
+  for (const Action *DeviceAction : CudaDeviceActions) {
+    if (DeviceAction->getKind() != Action::BackendJobClass) {
+      PartialCompilation = true;
+      break;
+    }
+  }
+
+  // Figure out what to do with device actions -- pass them as inputs to the
+  // host action or run each of them independently.
+  if (PartialCompilation || DeviceOnlyCompilation) {
+    // In case of partial or device-only compilation, the results of device
+    // actions are not consumed by the host action. Device actions therefore
+    // have to be added to the top-level actions list with AtTopLevel=true and
+    // run independently.
+
+    // -o is ambiguous if we have more than one top-level action.
+    if (Args.hasArg(options::OPT_o) &&
+        (!DeviceOnlyCompilation || GpuArchList.size() > 1)) {
+      D.Diag(clang::diag::err_drv_output_argument_with_multiple_files);
+      // The device actions have not been handed to any owner yet; delete them
+      // here so that bailing out does not leak them.
+      for (Action *DeviceAction : CudaDeviceActions)
+        delete DeviceAction;
+      return nullptr;
+    }
+
+    for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+      Actions.push_back(
+          new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[i]),
+                               GpuArchList[i], /* AtTopLevel */ true));
+    // Kill host action in case of device-only compilation.
+    if (DeviceOnlyCompilation)
+      Current.reset(nullptr);
+    return Current;
+  }
+
+  // Outputs of device actions during complete CUDA compilation get created
+  // with AtTopLevel=false and become inputs for the host action.
+  ActionList DeviceActions;
+  for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+    DeviceActions.push_back(
+        new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[i]),
+                             GpuArchList[i], /* AtTopLevel */ false));
+  // Return a new host action that incorporates original host action and all
+  // device actions.
+  return std::unique_ptr<Action>(
+      new CudaHostAction(std::move(Current), DeviceActions));
+}
+
void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
const InputList &Inputs, ActionList &Actions) const {
llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
@@ -1312,6 +1412,25 @@
continue;
}
+ phases::ID CudaInjectionPhase;
+ if (isSaveTempsEnabled()) {
+ // All phases are done independently, inject GPU blobs during compilation
+ // phase as that's where we generate glue code to init them.
+ CudaInjectionPhase = phases::Compile;
+ } else {
+ // Assumes that clang does everything up until linking phase, so we inject
+ // cuda device actions at the last step before linking. Otherwise CUDA
+ // host action forces preprocessor into a separate invocation.
+ if (FinalPhase == phases::Link) {
+ for (auto i = PL.begin(), e = PL.end(); i != e; ++i) {
+ auto next = i + 1;
+ if (next != e && *next == phases::Link)
+ CudaInjectionPhase = *i;
+ }
+ } else
+ CudaInjectionPhase = FinalPhase;
+ }
+
// Build the pipeline for this file.
std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType));
for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
@@ -1337,6 +1456,15 @@
// Otherwise construct the appropriate action.
Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
+
+ if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase &&
+ !Args.hasArg(options::OPT_cuda_host_only)) {
+ Current = buildCudaActions(*this, TC, Args, InputArg, InputType,
+ std::move(Current), Actions);
+ if (!Current)
+ break;
+ }
+
if (Current->getType() == types::TY_Nothing)
break;
}
@@ -1576,7 +1704,13 @@
if (isa<BackendJobAction>(JA)) {
// Check if the compiler supports emitting LLVM IR.
assert(Inputs->size() == 1);
- JobAction *CompileJA = cast<CompileJobAction>(*Inputs->begin());
+ JobAction *CompileJA;
+ // Extract real host action, if it's a CudaHostAction.
+ if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin()))
+ CompileJA = cast<CompileJobAction>(*CudaHA->begin());
+ else
+ CompileJA = cast<CompileJobAction>(*Inputs->begin());
+
const Tool *Compiler = TC->SelectTool(*CompileJA);
if (!Compiler)
return nullptr;
@@ -1610,6 +1744,20 @@
InputInfo &Result) const {
llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
+ InputInfoList CudaDeviceInputInfos;
+ if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+ InputInfo II;
+ // Append outputs of device jobs to the input list.
+ for (const Action *DA : CHA->getDeviceActions()) {
+ BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+ /*MultipleArchs*/ false, LinkingOutput, II);
+ CudaDeviceInputInfos.push_back(II);
+ }
+ // Override current action with a real host compile action and continue
+ // processing it.
+ A = *CHA->begin();
+ }
+
if (const InputAction *IA = dyn_cast<InputAction>(A)) {
// FIXME: It would be nice to not claim this here; maybe the old scheme of
// just using Args was better?
@@ -1635,11 +1783,24 @@
else
TC = &C.getDefaultToolChain();
- BuildJobsForAction(C, *BAA->begin(), TC, BAA->getArchName(), AtTopLevel,
+ BuildJobsForAction(C, *BAA->begin(), TC, ArchName, AtTopLevel,
MultipleArchs, LinkingOutput, Result);
return;
}
+ if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+ // Figure out which NVPTX triple to use for device-side compilation based on
+ // whether host is 64-bit.
+ llvm::Triple DeviceTriple(C.getDefaultToolChain().getTriple().isArch64Bit()
+ ? "nvptx64-nvidia-cuda"
+ : "nvptx-nvidia-cuda");
+ BuildJobsForAction(C, *CDA->begin(),
+ &getToolChain(C.getArgs(), DeviceTriple),
+ CDA->getGpuArchName(), CDA->isAtTopLevel(),
+ /*MultipleArchs*/ true, LinkingOutput, Result);
+ return;
+ }
+
const ActionList *Inputs = &A->getInputs();
const JobAction *JA = cast<JobAction>(A);
@@ -1671,6 +1832,10 @@
if (JA->getType() == types::TY_dSYM)
BaseInput = InputInfos[0].getFilename();
+ // Append outputs of cuda device jobs to the input list
+ if (CudaDeviceInputInfos.size())
+ InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
+
// Determine the place to write output to, if any.
if (JA->getType() == types::TY_Nothing)
Result = InputInfo(A->getType(), BaseInput);
@@ -2052,6 +2217,9 @@
break;
}
break;
+ case llvm::Triple::CUDA:
+ TC = new toolchains::CudaToolChain(*this, Target, Args);
+ break;
default:
// Of these targets, Hexagon is the only one that might have
// an OS of Linux, in which case it got handled above already.
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index da020a2..e6a1bc9 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -151,6 +151,8 @@
case Action::InputClass:
case Action::BindArchClass:
+ case Action::CudaDeviceClass:
+ case Action::CudaHostClass:
case Action::LipoJobClass:
case Action::DsymutilJobClass:
case Action::VerifyDebugInfoJobClass:
diff --git a/clang/lib/Driver/ToolChains.cpp b/clang/lib/Driver/ToolChains.cpp
index eafc72b..15e36a1 100644
--- a/clang/lib/Driver/ToolChains.cpp
+++ b/clang/lib/Driver/ToolChains.cpp
@@ -3652,6 +3652,65 @@
return new tools::dragonfly::Linker(*this);
}
+/// Stub for the CUDA toolchain. At the moment we don't have an assembler or
+/// linker, so the toolchain is needed mainly to propagate device-side options
+/// to CC1.
+
+CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+ const ArgList &Args)
+ : Linux(D, Triple, Args) {} // All construction work is delegated to the Linux base class.
+
+/// Emits the cc1 target options for this tool chain: everything the Linux
+/// base tool chain adds, followed by -fcuda-is-device.
+void CudaToolChain::addClangTargetOptions(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  Linux::addClangTargetOptions(DriverArgs, CC1Args);
+  CC1Args.push_back("-fcuda-is-device");
+}
+
+/// Builds the argument list to use when compiling for \p BoundArch.
+///
+/// Every argument of \p Args is copied over, except that -Xarch_<arch>
+/// arguments are dropped unless <arch> equals \p BoundArch, in which case they
+/// are replaced by the option they wrap. Finally -march=<BoundArch> is
+/// appended.
+///
+/// \param Args       arguments to translate.
+/// \param BoundArch  GPU arch name, or null. With a null arch no -Xarch_
+///                   argument matches and no -march= is appended.
+/// \returns a DerivedArgList allocated with new and returned as a raw pointer.
+llvm::opt::DerivedArgList *
+CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                             const char *BoundArch) const {
+  DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs());
+  const OptTable &Opts = getDriver().getOpts();
+
+  for (Arg *A : Args) {
+    if (A->getOption().matches(options::OPT_Xarch__)) {
+      // Skip this argument unless the architecture matches BoundArch. The null
+      // check comes first so that a StringRef is never built from a null
+      // pointer.
+      if (!BoundArch || A->getValue(0) != StringRef(BoundArch))
+        continue;
+
+      unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
+      unsigned Prev = Index;
+      std::unique_ptr<Arg> XarchArg(Opts.ParseOneArg(Args, Index));
+
+      // If the argument parsing failed or more than one argument was
+      // consumed, the -Xarch_ argument's parameter tried to consume
+      // extra arguments. Emit an error and ignore.
+      if (!XarchArg || Index > Prev + 1) {
+        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args)
+            << A->getAsString(Args);
+        continue;
+      }
+
+      // We also want to disallow any options which would alter the
+      // driver behavior; that isn't going to work in our model. We
+      // use the DriverOption flag as an approximation, although things
+      // like -O4 are going to slip through.
+      if (XarchArg->getOption().hasFlag(options::DriverOption)) {
+        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver)
+            << A->getAsString(Args);
+        continue;
+      }
+
+      // Hand the parsed argument over to DAL and append it in place of the
+      // -Xarch_ wrapper.
+      XarchArg->setBaseArg(A);
+      A = XarchArg.release();
+      DAL->AddSynthesizedArg(A);
+    }
+    DAL->append(A);
+  }
+
+  // Without a bound architecture there is no -march= value to add.
+  if (BoundArch)
+    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
+                      BoundArch);
+  return DAL;
+}
+
/// XCore tool chain
XCore::XCore(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
: ToolChain(D, Triple, Args) {
diff --git a/clang/lib/Driver/ToolChains.h b/clang/lib/Driver/ToolChains.h
index 3689682..327ff9b 100644
--- a/clang/lib/Driver/ToolChains.h
+++ b/clang/lib/Driver/ToolChains.h
@@ -699,6 +699,18 @@
std::string computeSysRoot() const;
};
+/// Tool chain for CUDA targets. Derives from Linux and overrides argument
+/// translation and the cc1 target options.
+class LLVM_LIBRARY_VISIBILITY CudaToolChain : public Linux {
+public:
+  CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+                const llvm::opt::ArgList &Args);
+
+  /// Adds the target options passed to cc1 for this tool chain.
+  void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                             llvm::opt::ArgStringList &CC1Args) const override;
+
+  /// Produces the argument list to use when compiling for \p BoundArch.
+  llvm::opt::DerivedArgList *
+  TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                const char *BoundArch) const override;
+};
+
class LLVM_LIBRARY_VISIBILITY Hexagon_TC : public Linux {
protected:
GCCVersion GCCLibAndIncVersion;
diff --git a/clang/lib/Driver/Tools.cpp b/clang/lib/Driver/Tools.cpp
index bf9b4ba..c6dc178 100644
--- a/clang/lib/Driver/Tools.cpp
+++ b/clang/lib/Driver/Tools.cpp
@@ -1488,6 +1488,12 @@
return CPUName;
}
+ case llvm::Triple::nvptx:
+ case llvm::Triple::nvptx64:
+ if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
+ return A->getValue();
+ return "";
+
case llvm::Triple::ppc:
case llvm::Triple::ppc64:
case llvm::Triple::ppc64le: {
@@ -2826,8 +2832,14 @@
getToolChain().getTriple().isWindowsCygwinEnvironment();
bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment();
- assert(Inputs.size() == 1 && "Unable to handle multiple inputs.");
+ // Check number of inputs for sanity. We need at least one input.
+ assert(Inputs.size() >= 1 && "Must have at least one input.");
const InputInfo &Input = Inputs[0];
+ // CUDA compilation may have multiple inputs (source file + results of
+ // device-side compilations). All other jobs are expected to have exactly one
+ // input.
+ bool IsCuda = types::isCuda(Input.getType());
+ assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
// Invoke ourselves in -cc1 mode.
//
@@ -4812,14 +4824,12 @@
assert(Output.isNothing() && "Invalid output.");
}
- for (const auto &II : Inputs) {
- addDashXForInput(Args, II, CmdArgs);
+ addDashXForInput(Args, Input, CmdArgs);
- if (II.isFilename())
- CmdArgs.push_back(II.getFilename());
- else
- II.getInputArg().renderAsInput(Args, CmdArgs);
- }
+ if (Input.isFilename())
+ CmdArgs.push_back(Input.getFilename());
+ else
+ Input.getInputArg().renderAsInput(Args, CmdArgs);
Args.AddAllArgs(CmdArgs, options::OPT_undef);
@@ -4857,6 +4867,16 @@
CmdArgs.push_back(SplitDwarfOut);
}
+ // Host-side cuda compilation receives device-side outputs as Inputs[1...].
+ // Include them with -fcuda-include-gpubinary.
+ if (IsCuda && Inputs.size() > 1)
+ for (InputInfoList::const_iterator it = std::next(Inputs.begin()),
+ ie = Inputs.end();
+ it != ie; ++it) {
+ CmdArgs.push_back("-fcuda-include-gpubinary");
+ CmdArgs.push_back(it->getFilename());
+ }
+
// Finally add the compile command to the compilation.
if (Args.hasArg(options::OPT__SLASH_fallback) &&
Output.getType() == types::TY_Object &&
diff --git a/clang/lib/Driver/Types.cpp b/clang/lib/Driver/Types.cpp
index 7b28145..2085b01 100644
--- a/clang/lib/Driver/Types.cpp
+++ b/clang/lib/Driver/Types.cpp
@@ -86,6 +86,7 @@
case TY_C: case TY_PP_C:
case TY_CL:
case TY_CUDA: case TY_PP_CUDA:
+ case TY_CUDA_DEVICE:
case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias:
case TY_CXX: case TY_PP_CXX:
case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
@@ -122,7 +123,19 @@
case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
case TY_CXXHeader: case TY_PP_CXXHeader:
case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader:
- case TY_CUDA: case TY_PP_CUDA:
+ case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE:
+ return true;
+ }
+}
+
+bool types::isCuda(ID Id) { // Reports whether Id is one of the CUDA types listed below.
+ switch (Id) {
+ default: // Every id not listed below is not a CUDA type.
+ return false;
+
+ case TY_CUDA:
+ case TY_PP_CUDA:
+ case TY_CUDA_DEVICE: // All three cases share the return that follows.
return true;
}
}
@@ -206,10 +219,12 @@
P.push_back(phases::Compile);
P.push_back(phases::Backend);
}
- P.push_back(phases::Assemble);
+ if (Id != TY_CUDA_DEVICE)
+ P.push_back(phases::Assemble);
}
}
- if (!onlyPrecompileType(Id)) {
+
+ if (!onlyPrecompileType(Id) && Id != TY_CUDA_DEVICE) {
P.push_back(phases::Link);
}
assert(0 < P.size() && "Not enough phases in list");