sfantao created this revision. sfantao added reviewers: ABataev, jlebar, tra, echristo, hfinkel. sfantao added subscribers: caomhin, carlo.bertolli, arpith-jacob, cfe-commits.
This patch replaces the CUDA-specific actions with a generic offload action. The offload action may have multiple dependences, classified as "host" and "device". The way this generic offloading action is used is very similar to what is done today by the CUDA implementation: it is used to set a specific toolchain and architecture for its dependences during job generation.

This patch also proposes propagating the offloading information through the action graph so that it can easily be retrieved at any point during command generation. This allows, e.g., the clang tool to evaluate whether CUDA should be supported for the device or the host, and ptxas to easily retrieve the target architecture.

This is an example of what the action graph looks like (compilation of a single CUDA file for two GPU architectures):

```
0: input, "cudatests.cu", cuda, (host-cuda)
1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
2: compiler, {1}, ir, (host-cuda)
3: input, "cudatests.cu", cuda, (device-cuda, sm_35)
4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_35)
5: compiler, {4}, ir, (device-cuda, sm_35)
6: backend, {5}, assembler, (device-cuda, sm_35)
7: assembler, {6}, object, (device-cuda, sm_35)
8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {7}, object
9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {6}, assembler
10: input, "cudatests.cu", cuda, (device-cuda, sm_37)
11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_37)
12: compiler, {11}, ir, (device-cuda, sm_37)
13: backend, {12}, assembler, (device-cuda, sm_37)
14: assembler, {13}, object, (device-cuda, sm_37)
15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_37)" {14}, object
16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_37)" {13}, assembler
17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda)
18: offload, "host-cuda (powerpc64le-unknown-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir
19: backend, {18}, assembler
20: assembler, {19}, object
21: input, "cuda", object
22: input, "cudart", object
23: linker, {20, 21, 22}, image
```

The changes in this patch pass the existing regression tests (i.e., the existing functionality is preserved), and the resulting binaries execute correctly on a Power8+K40 machine.

http://reviews.llvm.org/D18171

Files:
  include/clang/Driver/Action.h
  include/clang/Driver/Driver.h
  lib/Driver/Action.cpp
  lib/Driver/Driver.cpp
  lib/Driver/ToolChain.cpp
  lib/Driver/ToolChains.cpp
  lib/Driver/ToolChains.h
  lib/Driver/Tools.cpp
  lib/Frontend/CreateInvocationFromCommandLine.cpp
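To make the graph above easier to relate to the code, here is a condensed sketch of how the driver builds the offload nodes. It is extracted (and slightly abridged) from the lib/Driver/Driver.cpp changes in this patch; the outer per-architecture loop is elided, and AssembleAction, BackendAction, GpuArchList, CudaTC, HostAction and the Compilation C are assumed to be set up by the surrounding driver code.

```cpp
// For each GPU architecture I, wrap the device-side assemble and backend
// results in offload actions that carry the device toolchain, the bound GPU
// architecture and the CUDA offload kind (nodes 8/9 and 15/16 above).
ActionList DeviceActions;
for (auto &A : {AssembleAction, BackendAction}) {
  OffloadAction::DeviceDependences DDep;
  DDep.add(A, CudaTC, GpuArchList[I], Action::OFFLOAD_CUDA);
  DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
}

// Link all device results into a fat binary (node 17)...
auto FatbinAction =
    C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);

// ...and return an offload action that combines the original host compile
// with the device fat binary (node 18); the host job consumes this action.
OffloadAction::HostDependence HDep(HostAction, C.getOffloadingHostToolChain(),
                                   /*BoundArch=*/nullptr, Action::OFFLOAD_CUDA);
OffloadAction::DeviceDependences DDep;
DDep.add(FatbinAction, CudaTC, /*BoundArch=*/nullptr, Action::OFFLOAD_CUDA);
return C.MakeAction<OffloadAction>(HDep, DDep);
```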
Index: lib/Frontend/CreateInvocationFromCommandLine.cpp =================================================================== --- lib/Frontend/CreateInvocationFromCommandLine.cpp +++ lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -60,25 +60,25 @@ } // We expect to get back exactly one command job, if we didn't something - // failed. CUDA compilation is an exception as it creates multiple jobs. If - // that's the case, we proceed with the first job. If caller needs particular - // CUDA job, it should be controlled via --cuda-{host|device}-only option - // passed to the driver. + // failed. Offload compilation is an exception as it creates multiple jobs. If + // that's the case, we proceed with the first job. If caller needs a + // particular job, it should be controlled via options (e.g. + // --cuda-{host|device}-only for CUDA) passed to the driver. const driver::JobList &Jobs = C->getJobs(); - bool CudaCompilation = false; + bool OffloadCompilation = false; if (Jobs.size() > 1) { for (auto &A : C->getActions()){ // On MacOSX real actions may end up being wrapped in BindArchAction if (isa<driver::BindArchAction>(A)) A = *A->input_begin(); - if (isa<driver::CudaDeviceAction>(A)) { - CudaCompilation = true; + if (isa<driver::OffloadAction>(A)) { + OffloadCompilation = true; break; } } } if (Jobs.size() == 0 || !isa<driver::Command>(*Jobs.begin()) || - (Jobs.size() > 1 && !CudaCompilation)) { + (Jobs.size() > 1 && !OffloadCompilation)) { SmallString<256> Msg; llvm::raw_svector_ostream OS(Msg); Jobs.Print(OS, "; ", true); Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -3565,7 +3565,7 @@ // CUDA compilation may have multiple inputs (source file + results of // device-side compilations). All other jobs are expected to have exactly one // input. - bool IsCuda = types::isCuda(Input.getType()); + bool IsCuda = JA.isOffloading(Action::OFFLOAD_CUDA); assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs."); // Invoke ourselves in -cc1 mode. @@ -3583,13 +3583,13 @@ // particular compilation pass we're constructing here. For now we // can check which toolchain we're using and pick the other one to // extract the triple. - if (&getToolChain() == - C.getSingleOffloadDeviceToolChain<Action::OFFLOAD_CUDA>()) + if (JA.isDeviceOffloading(Action::OFFLOAD_CUDA)) AuxToolChain = C.getOffloadingHostToolChain(); - else if (&getToolChain() == C.getOffloadingHostToolChain()) + else { + assert(C.isOffloadingHostKind(Action::OFFLOAD_CUDA) && + "Expecting CUDA host toolchain."); AuxToolChain = C.getSingleOffloadDeviceToolChain<Action::OFFLOAD_CUDA>(); - else - llvm_unreachable("Can't figure out CUDA compilation mode."); + } assert(AuxToolChain != nullptr && "No aux toolchain."); CmdArgs.push_back("-aux-triple"); CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str())); @@ -10883,10 +10883,9 @@ static_cast<const toolchains::CudaToolChain &>(getToolChain()); assert(TC.getTriple().isNVPTX() && "Wrong platform"); - std::vector<std::string> gpu_archs = - Args.getAllArgValues(options::OPT_march_EQ); - assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas."); - const std::string& gpu_arch = gpu_archs[0]; + // Obtain architecture from the action. + const char *gpu_arch = JA.getOffloadingArch(); + assert(gpu_arch && "Device action expected to have an architecture."); ArgStringList CmdArgs; CmdArgs.push_back(TC.getTriple().isArch64Bit() ? 
"-m64" : "-m32"); @@ -10960,12 +10959,19 @@ CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); for (const auto& II : Inputs) { - auto* A = cast<const CudaDeviceAction>(II.getAction()); + auto *A = II.getAction(); + assert(A->getInputs().size() == 1 && + "Device offload action is expected to have a single input"); + const char *gpu_arch = A->getOffloadingArch(); + assert(gpu_arch && + "Device action expected to have associated a GPU architecture!"); + // We need to pass an Arch of the form "sm_XX" for cubin files and // "compute_XX" for ptx. - const char *Arch = (II.getType() == types::TY_PP_Asm) - ? A->getComputeArchName() - : A->getGpuArchName(); + const char *Arch = + (II.getType() == types::TY_PP_Asm) + ? toolchains::CudaToolChain::GpuArchToComputeName(gpu_arch) + : gpu_arch; CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + Arch + ",file=" + II.getFilename())); } Index: lib/Driver/ToolChains.h =================================================================== --- lib/Driver/ToolChains.h +++ lib/Driver/ToolChains.h @@ -833,6 +833,11 @@ // ptxas. bool useIntegratedAs() const override { return false; } + // Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual + // compute arch, e.g. "compute_20". Returns null if the input arch is null or + // doesn't match an existing arch. + static const char *GpuArchToComputeName(const char *ArchName); + protected: Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains.cpp =================================================================== --- lib/Driver/ToolChains.cpp +++ lib/Driver/ToolChains.cpp @@ -4291,6 +4291,21 @@ return DAL; } +const char *CudaToolChain::GpuArchToComputeName(const char *ArchName) { + if (!ArchName) + return nullptr; + return llvm::StringSwitch<const char *>(ArchName) + .Cases("sm_20", "sm_21", "compute_20") + .Case("sm_30", "compute_30") + .Case("sm_32", "compute_32") + .Case("sm_35", "compute_35") + .Case("sm_37", "compute_37") + .Case("sm_50", "compute_50") + .Case("sm_52", "compute_52") + .Case("sm_53", "compute_53") + .Default(nullptr); +} + Tool *CudaToolChain::buildAssembler() const { return new tools::NVPTX::Assembler(*this); } Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -248,8 +248,7 @@ case Action::InputClass: case Action::BindArchClass: - case Action::CudaDeviceClass: - case Action::CudaHostClass: + case Action::OffloadClass: case Action::LipoJobClass: case Action::DsymutilJobClass: case Action::VerifyDebugInfoJobClass: Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -987,18 +987,33 @@ } else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) { os << '"' << BIA->getArchName() << '"' << ", {" << PrintActions1(C, *BIA->input_begin(), Ids) << "}"; - } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) { - os << '"' - << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)") - << '"' << ", {" << PrintActions1(C, *CDA->input_begin(), Ids) << "}"; + } else if (OffloadAction *OA = dyn_cast<OffloadAction>(A)) { + bool IsFirst = true; + OA->doOnEachDependence( + [&](Action *A, const ToolChain *TC, const char *BoundArch) { + // E.g. 
for two CUDA device dependences whose bound arch is sm_20 and + // sm_35 this will generate: + // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device" + // (nvptx64-nvidia-cuda:sm_35) {#ID} + if (!IsFirst) + os << ", "; + os << '"'; + if (TC) + os << A->getOffloadingKindPrefix(); + else + os << "host"; + os << " ("; + os << TC->getTriple().normalize(); + + if (BoundArch) + os << ":" << BoundArch; + os << ")"; + os << '"'; + os << " {" << PrintActions1(C, A, Ids) << "}"; + IsFirst = false; + }); } else { - const ActionList *AL; - if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) { - os << "{" << PrintActions1(C, *CHA->input_begin(), Ids) << "}" - << ", gpu binaries "; - AL = &CHA->getDeviceActions(); - } else - AL = &A->getInputs(); + const ActionList *AL = &A->getInputs(); if (AL->size()) { const char *Prefix = "{"; @@ -1011,10 +1026,24 @@ os << "{}"; } + // Append offload info for all options other than the offloading action + // itself (e.g. (cuda-device, sm_20) or (cuda-host)). + std::string offload_str; + llvm::raw_string_ostream offload_os(offload_str); + if (!isa<OffloadAction>(A)) { + auto S = A->getOffloadingKindPrefix(); + if (!S.empty()) { + offload_os << ", (" << S; + if (A->getOffloadingArch()) + offload_os << ", " << A->getOffloadingArch(); + offload_os << ")"; + } + } + unsigned Id = Ids.size(); Ids[A] = Id; llvm::errs() << Id << ": " << os.str() << ", " - << types::getTypeName(A->getType()) << "\n"; + << types::getTypeName(A->getType()) << offload_os.str() << "\n"; return Id; } @@ -1327,8 +1356,12 @@ options::OPT_cuda_device_only); // Host-only compilation case. if (PartialCompilationArg && - PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) - return C.MakeAction<CudaHostAction>(HostAction, ActionList()); + PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) { + OffloadAction::HostDependence HDep( + HostAction, C.getOffloadingHostToolChain(), /*BoundArch=*/nullptr, + Action::OFFLOAD_CUDA); + return C.MakeAction<OffloadAction>(HDep); + } // Collect all cuda_gpu_arch parameters, removing duplicates. SmallVector<const char *, 4> GpuArchList; @@ -1339,7 +1372,7 @@ A->claim(); const auto& Arch = A->getValue(); - if (!CudaDeviceAction::IsValidGpuArchName(Arch)) + if (!toolchains::CudaToolChain::GpuArchToComputeName(Arch)) C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch; else if (GpuArchNames.insert(Arch).second) GpuArchList.push_back(Arch); @@ -1355,9 +1388,11 @@ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg)); + const ToolChain *CudaTC = + C.getSingleOffloadDeviceToolChain<Action::OFFLOAD_CUDA>(); + // Build actions for all device inputs. 
- assert(C.getSingleOffloadDeviceToolChain<Action::OFFLOAD_CUDA>() && - "Missing toolchain for device-side compilation."); + assert(CudaTC && "Missing toolchain for device-side compilation."); ActionList CudaDeviceActions; C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions); assert(GpuArchList.size() == CudaDeviceActions.size() && @@ -1385,10 +1420,13 @@ return nullptr; } - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - Actions.push_back(C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I], - GpuArchList[I], - /* AtTopLevel */ true)); + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + OffloadAction::DeviceDependences DDep; + DDep.add(CudaDeviceActions[I], CudaTC, GpuArchList[I], + Action::OFFLOAD_CUDA); + Actions.push_back( + C.MakeAction<OffloadAction>(DDep, CudaDeviceActions[I]->getType())); + } // Kill host action in case of device-only compilation. if (DeviceOnlyCompilation) return nullptr; @@ -1408,19 +1446,23 @@ Action* BackendAction = AssembleAction->getInputs()[0]; assert(BackendAction->getType() == types::TY_PP_Asm); - for (const auto& A : {AssembleAction, BackendAction}) { - DeviceActions.push_back(C.MakeAction<CudaDeviceAction>( - A, GpuArchList[I], /* AtTopLevel */ false)); + for (auto &A : {AssembleAction, BackendAction}) { + OffloadAction::DeviceDependences DDep; + DDep.add(A, CudaTC, GpuArchList[I], Action::OFFLOAD_CUDA); + DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType())); } } - auto FatbinAction = C.MakeAction<CudaDeviceAction>( - C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN), - /* GpuArchName = */ nullptr, - /* AtTopLevel = */ false); + auto FatbinAction = + C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN); + // Return a new host action that incorporates original host action and all // device actions. - return C.MakeAction<CudaHostAction>(std::move(HostAction), - ActionList({FatbinAction})); + OffloadAction::HostDependence HDep(HostAction, C.getOffloadingHostToolChain(), + /*BoundArch=*/nullptr, + Action::OFFLOAD_CUDA); + OffloadAction::DeviceDependences DDep; + DDep.add(FatbinAction, CudaTC, /*BoundArch=*/nullptr, Action::OFFLOAD_CUDA); + return C.MakeAction<OffloadAction>(HDep, DDep); } void Driver::BuildActions(Compilation &C, DerivedArgList &Args, @@ -1825,7 +1867,28 @@ } } } - +// Collapse an offloading action looking for a job of the given type. The input +// action is changed to the input of the collapsed sequence. If we effectively +// had a collapse return the corresponding offloading action, otherwise return +// null. +template <typename T> +static OffloadAction *collapseOffloadingAction(Action *&CurAction) { + if (!CurAction) + return nullptr; + if (auto *OA = dyn_cast<OffloadAction>(CurAction)) { + if (auto *HDep = OA->getHostDependence()) + if (isa<T>(HDep)) { + CurAction = HDep; + return OA; + } + if (auto *DDep = OA->getSingleDeviceDependence()) + if (isa<T>(DDep)) { + CurAction = DDep; + return OA; + } + } + return nullptr; +} // Returns a Tool for a given JobAction. In case the action and its // predecessors can be combined, updates Inputs with the inputs of the // first combined action. 
If one of the collapsed actions is a @@ -1835,34 +1898,39 @@ bool EmbedBitcode, const ToolChain *TC, const JobAction *JA, const ActionList *&Inputs, - const CudaHostAction *&CollapsedCHA) { + ActionList &CollapsedOffloadAction) { const Tool *ToolForJob = nullptr; - CollapsedCHA = nullptr; + CollapsedOffloadAction.clear(); // See if we should look for a compiler with an integrated assembler. We match // bottom up, so what we are actually looking for is an assembler job with a // compiler input. + // Look through offload actions between assembler and backend actions. + Action *BackendJA = (isa<AssembleJobAction>(JA) && Inputs->size() == 1) + ? *Inputs->begin() + : nullptr; + auto *BackendOA = collapseOffloadingAction<BackendJobAction>(BackendJA); + if (TC->useIntegratedAs() && !SaveTemps && !C.getArgs().hasArg(options::OPT_via_file_asm) && !C.getArgs().hasArg(options::OPT__SLASH_FA) && - !C.getArgs().hasArg(options::OPT__SLASH_Fa) && - isa<AssembleJobAction>(JA) && Inputs->size() == 1 && - isa<BackendJobAction>(*Inputs->begin())) { + !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA && + isa<BackendJobAction>(BackendJA)) { // A BackendJob is always preceded by a CompileJob, and without -save-temps // or -fembed-bitcode, they will always get combined together, so instead of // checking the backend tool, check if the tool for the CompileJob has an // integrated assembler. For -fembed-bitcode, CompileJob is still used to // look up tools for BackendJob, but they need to match before we can split // them. - const ActionList *BackendInputs = &(*Inputs)[0]->getInputs(); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast<CudaHostAction>(*BackendInputs->begin()); - JobAction *CompileJA = cast<CompileJobAction>( - CHA ? *CHA->input_begin() : *BackendInputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *BackendJA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA); + + assert(CompileJA && isa<CompileJobAction>(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA)); if (!Compiler) return nullptr; // When using -fembed-bitcode, it is required to have the same tool (clang) @@ -1876,7 +1944,15 @@ if (Compiler->hasIntegratedAssembler()) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + // Save the collapsed offload actions because they may still contain + // device action. Also propagate the offloading info of the inputs to the + // other action that are being integrated. + if (CompileOA) + CollapsedOffloadAction.push_back(CompileOA); + if (BackendOA) + CollapsedOffloadAction.push_back(BackendOA); + if (CompileOA || BackendOA) + JA->propagateOffloadInfo(CompileJA); } } @@ -1886,20 +1962,25 @@ if (isa<BackendJobAction>(JA)) { // Check if the compiler supports emitting LLVM IR. assert(Inputs->size() == 1); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast<CudaHostAction>(*Inputs->begin()); - JobAction *CompileJA = - cast<CompileJobAction>(CHA ? 
*CHA->input_begin() : *Inputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *JA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA); + + assert(CompileJA && isa<CompileJobAction>(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA)); if (!Compiler) return nullptr; if (!Compiler->canEmitIR() || (!SaveTemps && !EmbedBitcode)) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + + if (CompileOA) { + CollapsedOffloadAction.push_back(CompileOA); + JA->propagateOffloadInfo(CompileJA); + } } } @@ -1910,12 +1991,23 @@ // See if we should use an integrated preprocessor. We do so when we have // exactly one input, since this is the only use case we care about // (irrelevant since we don't support combine yet). - if (Inputs->size() == 1 && isa<PreprocessJobAction>(*Inputs->begin()) && + + // Look through offload actions after preprocessing. + Action *PreprocessJA = (Inputs->size() == 1) ? *Inputs->begin() : nullptr; + auto *PreprocessOA = + collapseOffloadingAction<PreprocessJobAction>(PreprocessJA); + + if (PreprocessJA && isa<PreprocessJobAction>(PreprocessJA) && !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && !C.getArgs().hasArg(options::OPT_rewrite_objc) && - ToolForJob->hasIntegratedCPP()) - Inputs = &(*Inputs)[0]->getInputs(); + ToolForJob->hasIntegratedCPP()) { + Inputs = &PreprocessJA->getInputs(); + if (PreprocessOA) { + CollapsedOffloadAction.push_back(PreprocessOA); + JA->propagateOffloadInfo(PreprocessJA); + } + } return ToolForJob; } @@ -1952,17 +2044,31 @@ const { llvm::PrettyStackTraceString CrashInfo("Building compilation jobs"); - InputInfoList CudaDeviceInputInfos; - if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) { - // Append outputs of device jobs to the input list. - for (const Action *DA : CHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, nullptr, AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); + InputInfoList OffloadDeviceInputInfos; + if (const OffloadAction *OA = dyn_cast<OffloadAction>(A)) { + Action *HostAction = nullptr; + OA->doOnEachDeviceDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDeviceInputInfos.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults)); + }); + OA->doOnHostDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + HostAction = DepA; + }); + + // If we have a single device action, just return its info. + if (!HostAction && OffloadDeviceInputInfos.size() == 1) { + return OffloadDeviceInputInfos.back(); } + + assert(HostAction && "Device actions are only expected to be used by the " + "host, not by each other."); + // Override current action with a real host compile action and continue // processing it. - A = *CHA->input_begin(); + A = HostAction; } if (const InputAction *IA = dyn_cast<InputAction>(A)) { @@ -1992,38 +2098,27 @@ MultipleArchs, LinkingOutput, CachedResults); } - if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) { - // Initial processing of CudaDeviceAction carries host params. 
- // Call BuildJobsForAction() again, now with correct device parameters. - InputInfo II = BuildJobsForAction( - C, *CDA->input_begin(), - C.getSingleOffloadDeviceToolChain<Action::OFFLOAD_CUDA>(), - CDA->getGpuArchName(), CDA->isAtTopLevel(), /*MultipleArchs=*/true, - LinkingOutput, CachedResults); - // Currently II's Action is *CDA->input_begin(). Set it to CDA instead, so - // that one can retrieve II's GPU arch. - II.setAction(A); - return II; - } const ActionList *Inputs = &A->getInputs(); const JobAction *JA = cast<JobAction>(A); - const CudaHostAction *CollapsedCHA = nullptr; + ActionList CollapsedOffloadActions; + const Tool *T = selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA, - Inputs, CollapsedCHA); + Inputs, CollapsedOffloadActions); if (!T) return InputInfo(); - // If we've collapsed action list that contained CudaHostAction we + // If we've collapsed action list that contained OffloadAction we // need to build jobs for device-side inputs it may have held. - if (CollapsedCHA) { - for (const Action *DA : CollapsedCHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, "", AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); - } + for (const auto *OA : CollapsedOffloadActions) { + cast<OffloadAction>(OA)->doOnEachDeviceDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDeviceInputInfos.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults)); + }); } // Only use pipes when there is exactly one input. @@ -2047,17 +2142,18 @@ if (JA->getType() == types::TY_dSYM) BaseInput = InputInfos[0].getFilename(); - // Append outputs of cuda device jobs to the input list - if (CudaDeviceInputInfos.size()) - InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end()); + // Append outputs of offload device jobs to the input list + if (!OffloadDeviceInputInfos.empty()) + InputInfos.append(OffloadDeviceInputInfos.begin(), + OffloadDeviceInputInfos.end()); // Determine the place to write output to, if any. InputInfo Result; if (JA->getType() == types::TY_Nothing) Result = InputInfo(A, BaseInput); else Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch, - AtTopLevel, MultipleArchs), + AtTopLevel, MultipleArchs, TC), BaseInput); if (CCCPrintBindings && !CCGenDiagnostics) { @@ -2117,7 +2213,8 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, const char *BaseInput, const char *BoundArch, bool AtTopLevel, - bool MultipleArchs) const { + bool MultipleArchs, + const ToolChain *TC) const { llvm::PrettyStackTraceString CrashInfo("Computing output path"); // Output to a user requested destination? 
if (AtTopLevel && !isa<DsymutilJobAction>(JA) && !isa<VerifyJobAction>(JA)) { @@ -2203,6 +2300,7 @@ MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image); } else if (MultipleArchs && BoundArch) { SmallString<128> Output(getDefaultImageName()); + Output += JA.getOffloadingFileNamePrefix(TC); Output += "-"; Output.append(BoundArch); NamedOutput = C.getArgs().MakeArgString(Output.c_str()); @@ -2219,6 +2317,7 @@ if (!types::appendSuffixForType(JA.getType())) End = BaseName.rfind('.'); SmallString<128> Suffixed(BaseName.substr(0, End)); + Suffixed += JA.getOffloadingFileNamePrefix(TC); if (MultipleArchs && BoundArch) { Suffixed += "-"; Suffixed.append(BoundArch); Index: lib/Driver/Action.cpp =================================================================== --- lib/Driver/Action.cpp +++ lib/Driver/Action.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "clang/Driver/Action.h" +#include "clang/Driver/ToolChain.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" @@ -21,8 +22,7 @@ switch (AC) { case InputClass: return "input"; case BindArchClass: return "bind-arch"; - case CudaDeviceClass: return "cuda-device"; - case CudaHostClass: return "cuda-host"; + case OffloadClass: return "offload"; case PreprocessJobClass: return "preprocessor"; case PrecompileJobClass: return "precompiler"; case AnalyzeJobClass: return "analyzer"; @@ -40,6 +40,79 @@ llvm_unreachable("invalid class"); } +void Action::propagateDeviceOffloadInfo(OffloadKind OKind, + const char *OArch) const { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert( + (OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFFLOAD_None) && + "Setting device kind to a different device??"); + assert(!OffloadingHostKind && "Setting a device kind in a host action??"); + OffloadingDeviceKind = OKind; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch); +} + +void Action::propagateHostOffloadInfo(unsigned OKinds, + const char *OArch) const { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert(OffloadingDeviceKind == OFFLOAD_None && + "Setting a host kind in a device action."); + OffloadingHostKind |= OKinds; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateHostOffloadInfo(OffloadingHostKind, OArch); +} + +void Action::propagateOffloadInfo(const Action *A) const { + if (unsigned HK = A->getOffloadingHostKinds()) + propagateHostOffloadInfo(HK, A->getOffloadingArch()); + else + propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(), + A->getOffloadingArch()); +} + +std::string Action::getOffloadingKindPrefix() const { + switch (OffloadingDeviceKind) { + case OFFLOAD_None: + break; + case OFFLOAD_CUDA: + return "device-cuda"; + // Add other programming models here. + } + + if (!OffloadingHostKind) + return ""; + + std::string Res("host"); + if (OffloadingHostKind & OFFLOAD_CUDA) + Res += "-cuda"; + // Add other programming models here. + + return Res; +} + +std::string Action::getOffloadingFileNamePrefix(const ToolChain *TC) const { + // A file prefix is only generated for device actions and consists of the + // offload kind and triple. 
+ if (!OffloadingDeviceKind) + return ""; + + std::string Res("-"); + Res += getOffloadingKindPrefix(); + Res += "-"; + Res += TC->getTriple().normalize(); + return Res; +} + void InputAction::anchor() {} InputAction::InputAction(const Arg &_Input, types::ID _Type) @@ -51,45 +124,106 @@ BindArchAction::BindArchAction(Action *Input, const char *_ArchName) : Action(BindArchClass, Input), ArchName(_ArchName) {} -// Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual -// compute arch, e.g. "compute_20". Returns null if the input arch is null or -// doesn't match an existing arch. -static const char* GpuArchToComputeName(const char *ArchName) { - if (!ArchName) - return nullptr; - return llvm::StringSwitch<const char *>(ArchName) - .Cases("sm_20", "sm_21", "compute_20") - .Case("sm_30", "compute_30") - .Case("sm_32", "compute_32") - .Case("sm_35", "compute_35") - .Case("sm_37", "compute_37") - .Case("sm_50", "compute_50") - .Case("sm_52", "compute_52") - .Case("sm_53", "compute_53") - .Default(nullptr); +void OffloadAction::anchor() {} + +OffloadAction::OffloadAction(const HostDependence &HDep) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) { + OffloadingArch = HDep.getBoundArch(); + OffloadingHostKind = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); +}; + +OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty) + : Action(OffloadClass, DDeps.getActions(), Ty), HostTC(nullptr), + DevToolChains(DDeps.getToolChains()) { + auto &OKinds = DDeps.getOffloadKinds(); + auto &BArchs = DDeps.getBoundArchs(); + + // If we have a single dependency, inherit the offloading info from it. + if (OKinds.size() == 1) { + OffloadingDeviceKind = OKinds.front(); + OffloadingArch = BArchs.front(); + } + // Propagate info to the dependencies. + for (unsigned i = 0; i < getInputs().size(); ++i) + getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]); } -void CudaDeviceAction::anchor() {} +OffloadAction::OffloadAction(const HostDependence &HDep, + const DeviceDependences &DDeps) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()), + DevToolChains(DDeps.getToolChains()) { + // We use the kinds of the host dependence for this action. + OffloadingArch = HDep.getBoundArch(); + OffloadingHostKind = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); + + // Add device inputs and propagate info to the device actions. + for (unsigned i = 0; i < DDeps.getActions().size(); ++i) { + auto *A = DDeps.getActions()[i]; + // Skip actions of empty dependences. 
+ if (!A) + continue; + getInputs().push_back(A); + A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i], + DDeps.getBoundArchs()[i]); + } +} -CudaDeviceAction::CudaDeviceAction(Action *Input, const char *ArchName, - bool AtTopLevel) - : Action(CudaDeviceClass, Input), GpuArchName(ArchName), - AtTopLevel(AtTopLevel) { - assert(!GpuArchName || IsValidGpuArchName(GpuArchName)); +void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const { + if (!HostTC) + return; + auto *A = getInputs().front(); + Work(A, HostTC, A->getOffloadingArch()); } -const char *CudaDeviceAction::getComputeArchName() const { - return GpuArchToComputeName(GpuArchName); +void OffloadAction::doOnEachDeviceDependence( + const OffloadActionWorkTy &Work) const { + auto I = getInputs().begin(); + auto E = getInputs().end(); + if (I == E) + return; + + // Skip host action + if (HostTC) + ++I; + + auto TI = DevToolChains.begin(); + for (; I != E; ++I) + Work(*I, *TI, (*I)->getOffloadingArch()); } -bool CudaDeviceAction::IsValidGpuArchName(llvm::StringRef ArchName) { - return GpuArchToComputeName(ArchName.data()) != nullptr; +void OffloadAction::doOnEachDependence(const OffloadActionWorkTy &Work) const { + doOnHostDependence(Work); + doOnEachDeviceDependence(Work); } -void CudaHostAction::anchor() {} +Action *OffloadAction::getHostDependence() const { + return HostTC ? getInputs().front() : nullptr; +} + +Action *OffloadAction::getSingleDeviceDependence() const { + return (!HostTC && getInputs().size() == 1) ? getInputs().front() : nullptr; +} -CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions) - : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {} +void OffloadAction::DeviceDependences::add(Action *A, const ToolChain *TC, + const char *BoundArch, + OffloadKind OKind) { + AL.push_back(A); + TCL.push_back(TC); + BAL.push_back(BoundArch); + KL.push_back(OKind); +} + +OffloadAction::HostDependence::HostDependence(Action *A, const ToolChain *TC, + const char *BoundArch, + const DeviceDependences &DDeps) + : A(A), TC(TC), BoundArch(BoundArch), OffloadKinds(0u) { + for (auto K : DDeps.getOffloadKinds()) + OffloadKinds |= K; +} void JobAction::anchor() {} Index: include/clang/Driver/Driver.h =================================================================== --- include/clang/Driver/Driver.h +++ include/clang/Driver/Driver.h @@ -415,12 +415,11 @@ /// \param BoundArch - The bound architecture. /// \param AtTopLevel - Whether this is a "top-level" action. /// \param MultipleArchs - Whether multiple -arch options were supplied. - const char *GetNamedOutputPath(Compilation &C, - const JobAction &JA, - const char *BaseInput, - const char *BoundArch, - bool AtTopLevel, - bool MultipleArchs) const; + /// \param TC - Toolchain associated with the output. + const char *GetNamedOutputPath(Compilation &C, const JobAction &JA, + const char *BaseInput, const char *BoundArch, + bool AtTopLevel, bool MultipleArchs, + const ToolChain *TC) const; /// GetTemporaryPath - Return the pathname of a temporary file to use /// as part of compilation; the file will have the given prefix and suffix. 
Index: include/clang/Driver/Action.h =================================================================== --- include/clang/Driver/Action.h +++ include/clang/Driver/Action.h @@ -12,6 +12,7 @@ #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" namespace llvm { @@ -26,6 +27,8 @@ namespace clang { namespace driver { +class ToolChain; + /// Action - Represent an abstract compilation step to perform. /// /// An action represents an edge in the compilation graph; typically @@ -49,8 +52,7 @@ enum ActionClass { InputClass = 0, BindArchClass, - CudaDeviceClass, - CudaHostClass, + OffloadClass, PreprocessJobClass, PrecompileJobClass, AnalyzeJobClass, @@ -70,10 +72,6 @@ // The offloading kind determines if this action is binded to a particular // programming model. Each entry reserves one bit. - // - // FIXME: This is currently used to indicate that toolchains are used in a - // given programming as well, but will be used here as well once a generic - // offloading action is implemented. enum OffloadKind { OFFLOAD_None = 0x00, OFFLOAD_CUDA = 0x01, @@ -90,13 +88,24 @@ ActionList Inputs; protected: + /// Offload information. It has to be mutable as it needs to be adjusted if + /// actions are integrated. + /// \brief Multiple programming models may be supported simultaneously by the + /// same host. Therefore, the host offloading kind is a combination of kinds. + mutable unsigned OffloadingHostKind; + /// \brief Offloading kind of the device. + mutable OffloadKind OffloadingDeviceKind; + /// \brief The Offloading architecture associated with this action. + mutable const char *OffloadingArch; + Action(ActionClass Kind, types::ID Type) : Action(Kind, ActionList(), Type) {} Action(ActionClass Kind, Action *Input, types::ID Type) : Action(Kind, ActionList({Input}), Type) {} Action(ActionClass Kind, Action *Input) : Action(Kind, ActionList({Input}), Input->getType()) {} Action(ActionClass Kind, const ActionList &Inputs, types::ID Type) - : Kind(Kind), Type(Type), Inputs(Inputs) {} + : Kind(Kind), Type(Type), Inputs(Inputs), OffloadingHostKind(0u), + OffloadingDeviceKind(OFFLOAD_None), OffloadingArch(nullptr) {} public: virtual ~Action(); @@ -119,6 +128,36 @@ input_const_range inputs() const { return input_const_range(input_begin(), input_end()); } + + std::string getOffloadingKindPrefix() const; + std::string getOffloadingFileNamePrefix(const ToolChain *TC) const; + + /// \brief Set the device offload info of this action and propagate it to its + /// dependences. + void propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) const; + /// \brief Append the host offload info of this action and propagate it to its + /// dependences. + void propagateHostOffloadInfo(unsigned OKinds, const char *OArch) const; + /// \brief Set the offload info of this action to be the same as the provided + /// action, and propagate it to its dependences. + void propagateOffloadInfo(const Action *A) const; + + unsigned getOffloadingHostKinds() const { return OffloadingHostKind; } + OffloadKind getOffloadingDeviceKind() const { return OffloadingDeviceKind; } + const char *getOffloadingArch() const { return OffloadingArch; } + + /// \brief Check if this action have any offload kinds. Note that host offload + /// kinds are only set if the action is a dependence to an host offload + /// action. 
+ bool isHostOffloading(OffloadKind OKind) const { + return OffloadingHostKind & OKind; + } + bool isDeviceOffloading(OffloadKind OKind) const { + return OffloadingDeviceKind == OKind; + } + bool isOffloading(OffloadKind OKind) const { + return isHostOffloading(OKind) || isDeviceOffloading(OKind); + } }; class InputAction : public Action { @@ -151,43 +190,102 @@ } }; -class CudaDeviceAction : public Action { +/// \brief An offload action combines host or/and device actions according to +/// the programming model implementation needs and propagates the offloading +/// kind to its dependences. +class OffloadAction : public Action { virtual void anchor(); - /// GPU architecture to bind. Always of the form /sm_\d+/ or null (when the - /// action applies to multiple architectures). - const char *GpuArchName; - /// True when action results are not consumed by the host action (e.g when - /// -fsyntax-only or --cuda-device-only options are used). - bool AtTopLevel; - public: - CudaDeviceAction(Action *Input, const char *ArchName, bool AtTopLevel); + /// \brief Type used to communicate device actions. It associates bound + /// architecture, toolchain, and offload kind to each action. + class DeviceDependences { + public: + typedef SmallVector<const ToolChain *, 3> ToolChainList; + typedef SmallVector<const char *, 3> BoundArchList; + typedef SmallVector<OffloadKind, 3> OffloadKindList; + + private: + /// \brief The dependence action. + ActionList AL; + /// \brief The offloading toolchains that should be used with the action. + SmallVector<const ToolChain *, 3> TCL; + /// \brief The architectures that should be used with this action. + SmallVector<const char *, 3> BAL; + /// \brief The offload kind of each dependence. + SmallVector<OffloadKind, 3> KL; + + public: + /// \brief Add a action along with the associated toolchain, bound arch, and + /// offload kind. + void add(Action *A, const ToolChain *TC, const char *BoundArch, + OffloadKind OKind); + + /// \brief Get each of the individual arrays. + const ActionList &getActions() const { return AL; }; + const ToolChainList &getToolChains() const { return TCL; }; + const BoundArchList &getBoundArchs() const { return BAL; }; + const OffloadKindList &getOffloadKinds() const { return KL; }; + }; + + /// \brief Type used to communicate host actions. It associates bound + /// architecture, toolchain, and offload kinds to each action. + class HostDependence { + /// \brief The dependence action. + Action *A; + /// \brief The offloading toolchain that should be used with the action. + const ToolChain *TC; + /// \brief The architectures that should be used with this action. + const char *BoundArch; + /// \brief The offload kind of each dependence. + unsigned OffloadKinds; + + public: + HostDependence(Action *A, const ToolChain *TC, const char *BoundArch, + const unsigned OffloadKinds) + : A(A), TC(TC), BoundArch(BoundArch), OffloadKinds(OffloadKinds){}; + /// \brief Constructor version that obtains the offload kinds from the + /// device dependencies. 
+ HostDependence(Action *A, const ToolChain *TC, const char *BoundArch, + const DeviceDependences &DDeps); + Action *getAction() const { return A; }; + const ToolChain *getToolChain() const { return TC; }; + const char *getBoundArch() const { return BoundArch; }; + unsigned getOffloadKinds() const { return OffloadKinds; }; + }; - const char *getGpuArchName() const { return GpuArchName; } + typedef llvm::function_ref<void(Action *, const ToolChain *, const char *)> + OffloadActionWorkTy; - /// Gets the compute_XX that corresponds to getGpuArchName(). Returns null - /// when getGpuArchName() is null. - const char *getComputeArchName() const; +private: + /// \brief The offloading toolchain that should be used with the action. + const ToolChain *HostTC; - bool isAtTopLevel() const { return AtTopLevel; } + /// \brief The tool chains associated with the list of actions. + DeviceDependences::ToolChainList DevToolChains; - static bool IsValidGpuArchName(llvm::StringRef ArchName); +public: + OffloadAction(const HostDependence &HDep); + OffloadAction(const DeviceDependences &DDeps, types::ID Ty); + OffloadAction(const HostDependence &HDep, const DeviceDependences &DDeps); - static bool classof(const Action *A) { - return A->getKind() == CudaDeviceClass; - } -}; + /// \brief Execute the work specified in \a Work on the host dependence. + void doOnHostDependence(const OffloadActionWorkTy &Work) const; -class CudaHostAction : public Action { - virtual void anchor(); - ActionList DeviceActions; + /// \brief Execute the work specified in \a Work on each device dependence. + void doOnEachDeviceDependence(const OffloadActionWorkTy &Work) const; -public: - CudaHostAction(Action *Input, const ActionList &DeviceActions); + /// \brief Execute the work specified in \a Work on each dependence. + void doOnEachDependence(const OffloadActionWorkTy &Work) const; + + /// \brief Return the host dependence of this action, or null if we don't have + /// any. + Action *getHostDependence() const; - const ActionList &getDeviceActions() const { return DeviceActions; } + /// \brief Return the single device dependence of this action, or null if we + /// don't have one or we have more than one. + Action *getSingleDeviceDependence() const; - static bool classof(const Action *A) { return A->getKind() == CudaHostClass; } + static bool classof(const Action *A) { return A->getKind() == OffloadClass; } }; class JobAction : public Action {