This revision was automatically updated to reflect the committed changes. Closed by commit rG0f7e8631547a: [LinkerWrapper] Perform device linking steps in parallel (authored by jhuber6).
Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136701/new/

https://reviews.llvm.org/D136701

Files:
  clang/test/Driver/linker-wrapper.c
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
Index: clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
===================================================================
--- clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
+++ clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -59,6 +59,10 @@
   Flags<[WrapperOnlyOption]>, MetaVarName<"<number>">,
   HelpText<"Set the granularity of time-trace updates">;
 
+def wrapper_jobs : Joined<["--"], "wrapper-jobs=">,
+  Flags<[WrapperOnlyOption]>, MetaVarName<"<number>">,
+  HelpText<"Sets the number of parallel jobs to use for device linking">;
+
 // Flags passed to the device linker.
 def arch_EQ : Joined<["--"], "arch=">,
   Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"<arch>">,
Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
===================================================================
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
@@ -1119,19 +1120,34 @@
 /// be registered by the runtime.
 Expected<SmallVector<StringRef>>
 linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
-                       const InputArgList &Args) {
+                       const InputArgList &Args, char **Argv, int Argc) {
   llvm::TimeTraceScope TimeScope("Handle all device input");
 
-  DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile, 4>> InputsForTarget;
+  DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputMap;
   for (auto &File : LinkerInputFiles)
-    InputsForTarget[File].emplace_back(std::move(File));
+    InputMap[File].emplace_back(std::move(File));
   LinkerInputFiles.clear();
 
-  DenseMap<OffloadKind, SmallVector<OffloadingImage, 2>> Images;
-  for (auto &[ID, Input] : InputsForTarget) {
+  SmallVector<SmallVector<OffloadFile>> InputsForTarget;
+  for (auto &[ID, Input] : InputMap)
+    InputsForTarget.emplace_back(std::move(Input));
+  InputMap.clear();
+
+  std::mutex ImageMtx;
+  DenseMap<OffloadKind, SmallVector<OffloadingImage>> Images;
+  auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error {
     llvm::TimeTraceScope TimeScope("Link device input");
 
-    auto LinkerArgs = getLinkerArgs(Input, Args);
+    // Each thread needs its own copy of the base arguments to maintain
+    // per-device argument storage of synthetic strings.
+    const OptTable &Tbl = getOptTable();
+    BumpPtrAllocator Alloc;
+    StringSaver Saver(Alloc);
+    auto BaseArgs =
+        Tbl.parseArgs(Argc, Argv, OPT_INVALID, Saver, [](StringRef Err) {
+          reportError(createStringError(inconvertibleErrorCode(), Err));
+        });
+    auto LinkerArgs = getLinkerArgs(Input, BaseArgs);
 
     DenseSet<OffloadKind> ActiveOffloadKinds;
     for (const auto &File : Input)
@@ -1142,7 +1158,7 @@
     if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs))
       return std::move(Err);
 
-    // Write any remaining device inputs to an output file for the linker job.
+    // Write any remaining device inputs to an output file for the linker.
     for (const OffloadFile &File : Input) {
       auto FileNameOrErr = writeOffloadFile(File);
       if (!FileNameOrErr)
@@ -1150,7 +1166,7 @@
       InputFiles.emplace_back(*FileNameOrErr);
     }
 
-    // Link the remaining device files, if necessary, using the device linker.
+    // Link the remaining device files using the device linker.
     llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ));
     bool RequiresLinking =
         !Args.hasArg(OPT_embed_bitcode) &&
@@ -1171,17 +1187,31 @@
       TheImage.TheImageKind = IMG_Object;
       TheImage.TheOffloadKind = Kind;
      TheImage.StringData = {
-          {"triple", LinkerArgs.getLastArgValue(OPT_triple_EQ)},
-          {"arch", LinkerArgs.getLastArgValue(OPT_arch_EQ)}};
+          {"triple",
+           Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ))},
+          {"arch",
+           Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ))}};
       TheImage.Image = std::move(*FileOrErr);
+
+      std::lock_guard<decltype(ImageMtx)> Guard(ImageMtx);
       Images[Kind].emplace_back(std::move(TheImage));
     }
-  }
+    return Error::success();
+  });
+  if (Err)
+    return std::move(Err);
 
   // Create a binary image of each offloading image and embed it into a new
   // object file.
   SmallVector<StringRef> WrappedOutput;
-  for (const auto &[Kind, Input] : Images) {
+  for (auto &[Kind, Input] : Images) {
+    // We sort the entries before bundling so they appear in a deterministic
+    // order in the final binary.
+    llvm::sort(Input, [](OffloadingImage &A, OffloadingImage &B) {
+      return A.StringData["triple"].compare(B.StringData["triple"]) == 1 ||
+             A.StringData["arch"].compare(B.StringData["arch"]) == 1 ||
+             A.TheOffloadKind < B.TheOffloadKind;
+    });
     auto BundledImagesOrErr = bundleLinkedOutput(Input, Args, Kind);
     if (!BundledImagesOrErr)
       return BundledImagesOrErr.takeError();
@@ -1362,6 +1392,16 @@
   if (!CudaBinaryPath.empty())
     CudaBinaryPath = CudaBinaryPath + "/bin";
 
+  parallel::strategy = hardware_concurrency(1);
+  if (auto *Arg = Args.getLastArg(OPT_wrapper_jobs)) {
+    unsigned Threads = 0;
+    if (!llvm::to_integer(Arg->getValue(), Threads) || Threads == 0)
+      reportError(createStringError(
+          inconvertibleErrorCode(), "%s: expected a positive integer, got '%s'",
+          Arg->getSpelling().data(), Arg->getValue()));
+    parallel::strategy = hardware_concurrency(Threads);
+  }
+
   if (Args.hasArg(OPT_wrapper_time_trace_eq)) {
     unsigned Granularity;
     Args.getLastArgValue(OPT_wrapper_time_trace_granularity, "500")
@@ -1378,7 +1418,8 @@
     reportError(DeviceInputFiles.takeError());
 
   // Link and wrap the device images extracted from the linker input.
-  auto FilesOrErr = linkAndWrapDeviceFiles(*DeviceInputFiles, Args);
+  auto FilesOrErr =
+      linkAndWrapDeviceFiles(*DeviceInputFiles, Args, Argv, Argc);
   if (!FilesOrErr)
     reportError(FilesOrErr.takeError());
 
Index: clang/test/Driver/linker-wrapper.c
===================================================================
--- clang/test/Driver/linker-wrapper.c
+++ clang/test/Driver/linker-wrapper.c
@@ -107,7 +107,19 @@
 
 // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_52 {{.*}}.o
 // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o {{.*}}.o
-// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_52,file={{.*}}.out --image=profile=sm_70,file={{.*}}.out
+// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file={{.*}}.out --image=profile=sm_52,file={{.*}}.out
+
+// RUN: clang-offload-packager -o %t.out \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_80 \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_75 \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_70 \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
+// RUN:   -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu --wrapper-jobs=4 \
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA-PAR
+
+// CUDA-PAR: fatbinary{{.*}}-64 --create {{.*}}.fatbin
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90a \
@@ -120,7 +132,7 @@
 
 // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.out {{.*}}.o
 // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -o {{.*}}.out {{.*}}.o
-// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}}.out -input={{.*}}out -output={{.*}}.hipfb
+// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx908 -input=/dev/null -input={{.*}}.out -input={{.*}}out -output={{.*}}.hipfb
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \
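Editor's note, not part of the patch: a minimal standalone sketch of the threading pattern the change adopts, assuming only LLVM's Support headers. Device link jobs run through llvm::parallelForEachError, the worker count is set on llvm::parallel::strategy (the patch derives it from --wrapper-jobs), and the shared result container is only touched under a mutex. The helpers linkOneTarget and linkAllTargets and the plain std::string inputs are hypothetical placeholders, not the linker wrapper's real types.

// Sketch only: hypothetical helpers standing in for the wrapper's real
// per-target device linking step and its OffloadingImage bookkeeping.
#include "llvm/Support/Error.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"

#include <mutex>
#include <string>
#include <vector>

using namespace llvm;

// Stand-in for one device linking job (nvlink, lld, etc. in the real tool).
static Expected<std::string> linkOneTarget(const std::string &Input) {
  return "linked-" + Input;
}

// Run one job per offload target in parallel and collect the results under a
// lock, mirroring how the patch guards its Images map with ImageMtx.
static Error linkAllTargets(const std::vector<std::string> &Inputs,
                            std::vector<std::string> &Results, unsigned Jobs) {
  // The wrapper defaults to one thread and widens it for --wrapper-jobs=<N>.
  parallel::strategy = hardware_concurrency(Jobs ? Jobs : 1);

  std::mutex ResultMtx;
  return parallelForEachError(Inputs, [&](const std::string &In) -> Error {
    auto LinkedOrErr = linkOneTarget(In);
    if (!LinkedOrErr)
      return LinkedOrErr.takeError();

    // Shared state is modified only while holding the lock.
    std::lock_guard<std::mutex> Guard(ResultMtx);
    Results.push_back(std::move(*LinkedOrErr));
    return Error::success();
  });
}

int main() {
  std::vector<std::string> Inputs = {"sm_52", "sm_70", "gfx90a"};
  std::vector<std::string> Results;
  if (Error Err = linkAllTargets(Inputs, Results, /*Jobs=*/4)) {
    logAllUnhandledErrors(std::move(Err), errs(), "link failed: ");
    return 1;
  }
  for (const std::string &R : Results)
    outs() << R << "\n";
  return 0;
}

Two details of the real patch that the sketch glosses over: each worker re-parses the original Argv so MakeArgString has per-thread storage for synthetic strings (per the patch's own comment), and the collected images are sorted before bundling so the parallel completion order does not leak into the final binary, which is why the fatbinary and clang-offload-bundler CHECK lines in the test change order.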