jhuber6 created this revision. jhuber6 added reviewers: jdoerfert, JonChesterfield, tra, yaxunl. Herald added subscribers: mattd, gchakrabarti, asavonic, inglorion. Herald added a project: All. jhuber6 requested review of this revision. Herald added a project: clang. Herald added a subscriber: cfe-commits.
One current downside of the LLVM support for CUDA in RDC-mode is that we cannot JIT from the PTX image. This requires the user to provide the specific architecture when offloading. CUDA's runtime uses a special method to link the separate PTX files when in RDC-mode, whereas LLVM cannot do this with the chosen approach to supporting RDC-mode compilation. However, if we embed bitcode via LTO, we can use the single linked PTX image for the whole module and include it in the fatbinary. This allows us to run the following command and have it execute even without the correct architecture specified. clang foo.cu -foffload-lto -fgpu-rdc --offload-new-driver -lcudart It is also worth noting that in full-LTO mode, RDC-mode will behave exactly like non-RDC mode after linking. Depends on D127246 <https://reviews.llvm.org/D127246> Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D127901 Files: clang/test/Driver/linker-wrapper.c clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp =================================================================== --- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -557,9 +557,16 @@ CmdArgs.push_back(TheTriple.isArch64Bit() ? "-64" : "-32"); CmdArgs.push_back("--create"); CmdArgs.push_back(TempFile); - for (const auto &FileAndArch : InputFiles) - CmdArgs.push_back(Saver.save("--image=profile=" + std::get<1>(FileAndArch) + - ",file=" + std::get<0>(FileAndArch))); + for (const auto &FileAndArch : InputFiles) { + if (std::get<0>(FileAndArch).endswith(".s")) + CmdArgs.push_back(Saver.save("--image=profile=compute_" + + std::get<1>(FileAndArch).split("_").second + + ",file=" + std::get<0>(FileAndArch))); + else + CmdArgs.push_back( + Saver.save("--image=profile=" + std::get<1>(FileAndArch) + + ",file=" + std::get<0>(FileAndArch))); + } if (Error Err = executeCommands(*FatBinaryPath, CmdArgs)) return std::move(Err); @@ -820,6 +827,7 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles, SmallVectorImpl<std::string> &OutputFiles, + SmallVectorImpl<std::string> &AuxOutputFiles, const Triple &TheTriple, StringRef Arch) { SmallVector<OffloadFile, 4> BitcodeInputFiles; DenseSet<StringRef> UsedInRegularObj; @@ -998,6 +1006,7 @@ // Is we are compiling for NVPTX we need to run the assembler first. if (TheTriple.isNVPTX()) { for (auto &File : Files) { + AuxOutputFiles.push_back(static_cast<std::string>(File)); auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram); if (!FileOrErr) return FileOrErr.takeError(); @@ -1187,7 +1196,9 @@ // First link and remove all the input files containing bitcode. 
SmallVector<std::string> InputFiles; - if (Error Err = linkBitcodeFiles(Input, InputFiles, Triple, Arch)) + SmallVector<std::string> OutputFiles; + if (Error Err = + linkBitcodeFiles(Input, InputFiles, OutputFiles, Triple, Arch)) return Err; // Write any remaining device inputs to an output file for the linker job. @@ -1205,20 +1216,27 @@ : InputFiles.front(); if (!OutputOrErr) return OutputOrErr.takeError(); + OutputFiles.push_back(*OutputOrErr); - // Store the offloading image for each linked output file. + // Store the offloading image for each output file. for (OffloadKind Kind : ActiveOffloadKinds) { - llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> FileOrErr = - llvm::MemoryBuffer::getFileOrSTDIN(*OutputOrErr); - if (std::error_code EC = FileOrErr.getError()) - return createFileError(*OutputOrErr, EC); - - OffloadingImage TheImage{}; - TheImage.TheImageKind = IMG_Object; - TheImage.TheOffloadKind = Kind; - TheImage.StringData = {{"triple", TripleStr}, {"arch", Arch}}; - TheImage.Image = std::move(*FileOrErr); - Images[Kind].emplace_back(std::move(TheImage)); + for (StringRef Output : OutputFiles) { + // Ignore any PTX output if we're not creating a fatbinary. + if (Output.endswith(".s") && Kind != OFK_Cuda) + continue; + + llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> FileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(Output); + if (std::error_code EC = FileOrErr.getError()) + return createFileError(Output, EC); + + OffloadingImage TheImage{}; + TheImage.TheImageKind = Output.endswith(".s") ? 
IMG_PTX : IMG_Object; + TheImage.TheOffloadKind = Kind; + TheImage.StringData = {{"triple", TripleStr}, {"arch", Arch}}; + TheImage.Image = std::move(*FileOrErr); + Images[Kind].emplace_back(std::move(TheImage)); + } } } Index: clang/test/Driver/linker-wrapper.c =================================================================== --- clang/test/Driver/linker-wrapper.c +++ clang/test/Driver/linker-wrapper.c @@ -81,6 +81,16 @@ // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o {{.*}}.o // CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_52,file={{.*}}.out --image=profile=sm_70,file={{.*}}.out +// RUN: clang-offload-packager -o %t.out \ +// RUN: --image=file=%S/Inputs/dummy-bc.bc,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ +// RUN: -fembed-offload-object=%t.out +// RUN: clang-linker-wrapper --dry-run --host-triple x86_64-unknown-linux-gnu -linker-path \ +// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA-LTO + +// CUDA-LTO: ptxas{{.*}}-m64 -o [[CUBIN:.+]] -O2 --gpu-name sm_52 [[PTX:.+]] +// CUDA-LTO: fatbinary{{.*}}-64 --create [[FATBINARY:.+]] --image=profile=compute_52,file=[[PTX]] --image=profile=sm_52,file=[[CUBIN]] + // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits