This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. Closed by commit rG3530c35c6609: [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility (authored by jhuber6).
Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124292/new/ https://reviews.llvm.org/D124292 Files: clang/test/Driver/linker-wrapper.c clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp =================================================================== --- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -595,7 +595,7 @@ // TODO: Move these to a separate file. namespace nvptx { Expected<std::string> assemble(StringRef InputFile, Triple TheTriple, - StringRef Arch) { + StringRef Arch, bool RDC = true) { // NVPTX uses the ptxas binary to create device object files. Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath}); if (!PtxasPath) @@ -626,7 +626,8 @@ CmdArgs.push_back(Opt); CmdArgs.push_back("--gpu-name"); CmdArgs.push_back(Arch); - CmdArgs.push_back("-c"); + if (RDC) + CmdArgs.push_back("-c"); CmdArgs.push_back(InputFile); @@ -933,7 +934,8 @@ } Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles, - const Triple &TheTriple, StringRef Arch) { + const Triple &TheTriple, StringRef Arch, + bool &WholeProgram) { SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers; SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles; SmallVector<std::string, 4> NewInputFiles; @@ -1009,7 +1011,7 @@ }; // We assume visibility of the whole program if every input file was bitcode. - bool WholeProgram = BitcodeFiles.size() == InputFiles.size(); + WholeProgram = BitcodeFiles.size() == InputFiles.size(); auto LTOBackend = (EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode) : createLTO(TheTriple, Arch, WholeProgram); @@ -1089,7 +1091,7 @@ // Is we are compiling for NVPTX we need to run the assembler first. if (TheTriple.isNVPTX() && !EmbedBitcode) { for (auto &File : Files) { - auto FileOrErr = nvptx::assemble(File, TheTriple, Arch); + auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram); if (!FileOrErr) return FileOrErr.takeError(); File = *FileOrErr; @@ -1117,10 +1119,11 @@ for (auto &LinkerInput : LinkerInputMap) { DeviceFile &File = LinkerInput.getFirst(); Triple TheTriple = Triple(File.TheTriple); + bool WholeProgram = false; // Run LTO on any bitcode files and replace the input with the result. - if (Error Err = - linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch)) + if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, + File.Arch, WholeProgram)) return Err; // If we are embedding bitcode for JIT, skip the final device linking. @@ -1130,6 +1133,14 @@ continue; } + // If we performed LTO on NVPTX and had whole program visibility, we can use + // CUDA in non-RDC mode. + if (WholeProgram && TheTriple.isNVPTX()) { + assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed"); + LinkedImages.push_back(LinkerInput.getSecond().front()); + continue; + } + auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch); if (!ImageOrErr) return ImageOrErr.takeError(); Index: clang/test/Driver/linker-wrapper.c =================================================================== --- clang/test/Driver/linker-wrapper.c +++ clang/test/Driver/linker-wrapper.c @@ -38,5 +38,5 @@ // RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \ // RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LTO -// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 -c {{.*}}.s -// LTO: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.cubin +// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s +// LTO-NOT: nvlink
Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp =================================================================== --- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -595,7 +595,7 @@ // TODO: Move these to a separate file. namespace nvptx { Expected<std::string> assemble(StringRef InputFile, Triple TheTriple, - StringRef Arch) { + StringRef Arch, bool RDC = true) { // NVPTX uses the ptxas binary to create device object files. Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath}); if (!PtxasPath) @@ -626,7 +626,8 @@ CmdArgs.push_back(Opt); CmdArgs.push_back("--gpu-name"); CmdArgs.push_back(Arch); - CmdArgs.push_back("-c"); + if (RDC) + CmdArgs.push_back("-c"); CmdArgs.push_back(InputFile); @@ -933,7 +934,8 @@ } Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles, - const Triple &TheTriple, StringRef Arch) { + const Triple &TheTriple, StringRef Arch, + bool &WholeProgram) { SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers; SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles; SmallVector<std::string, 4> NewInputFiles; @@ -1009,7 +1011,7 @@ }; // We assume visibility of the whole program if every input file was bitcode. - bool WholeProgram = BitcodeFiles.size() == InputFiles.size(); + WholeProgram = BitcodeFiles.size() == InputFiles.size(); auto LTOBackend = (EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode) : createLTO(TheTriple, Arch, WholeProgram); @@ -1089,7 +1091,7 @@ // Is we are compiling for NVPTX we need to run the assembler first. if (TheTriple.isNVPTX() && !EmbedBitcode) { for (auto &File : Files) { - auto FileOrErr = nvptx::assemble(File, TheTriple, Arch); + auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram); if (!FileOrErr) return FileOrErr.takeError(); File = *FileOrErr; @@ -1117,10 +1119,11 @@ for (auto &LinkerInput : LinkerInputMap) { DeviceFile &File = LinkerInput.getFirst(); Triple TheTriple = Triple(File.TheTriple); + bool WholeProgram = false; // Run LTO on any bitcode files and replace the input with the result. - if (Error Err = - linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch)) + if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, + File.Arch, WholeProgram)) return Err; // If we are embedding bitcode for JIT, skip the final device linking. @@ -1130,6 +1133,14 @@ continue; } + // If we performed LTO on NVPTX and had whole program visibility, we can use + // CUDA in non-RDC mode. + if (WholeProgram && TheTriple.isNVPTX()) { + assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed"); + LinkedImages.push_back(LinkerInput.getSecond().front()); + continue; + } + auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch); if (!ImageOrErr) return ImageOrErr.takeError(); Index: clang/test/Driver/linker-wrapper.c =================================================================== --- clang/test/Driver/linker-wrapper.c +++ clang/test/Driver/linker-wrapper.c @@ -38,5 +38,5 @@ // RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \ // RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LTO -// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 -c {{.*}}.s -// LTO: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.cubin +// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s +// LTO-NOT: nvlink
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits