https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/133164
>From 8c8885d486ac72145d7fd7ba003db7ea9ab17a1f Mon Sep 17 00:00:00 2001 From: Joseph Huber <hube...@outlook.com> Date: Wed, 26 Mar 2025 16:03:20 -0500 Subject: [PATCH 1/3] [Clang] Make `--lto-partitions` only default for HIP Summary: The default behavior for LTO on other targets does not specify the number of LTO partitions. Recent changes made this default to 8 on AMDGPU which had some issues with the `libc` project. The option to disable this is HIP only so I think for now we should restrict this just to HIP. I'm definitely on board with getting some more parallelism here, but I think it should probably be restricted to just offloading languages. The new driver goes through the `--target=amdgcn-amd-amdhsa` for its output, which means we'd need to forward the default somehow. --- clang/lib/Driver/ToolChains/AMDGPU.cpp | 22 ++++++++-------------- clang/test/Driver/amdgpu-toolchain.c | 16 +--------------- libc/startup/gpu/CMakeLists.txt | 10 ++-------- 3 files changed, 11 insertions(+), 37 deletions(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index e919f4e941f47..c754cae2f96b2 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -625,6 +625,7 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-shared"); } + llvm::errs() << JA.getOffloadingDeviceKind() << "\n"; addLinkerCompressDebugSectionsOption(getToolChain(), Args, CmdArgs); Args.AddAllArgs(CmdArgs, options::OPT_L); getToolChain().AddFilePathLibArgs(Args, CmdArgs); @@ -633,7 +634,7 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin); addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO); - if (!ThinLTO) + if (!ThinLTO && JA.getOffloadingDeviceKind() == Action::OFK_HIP) addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs); } else if (Args.hasArg(options::OPT_mcpu_EQ)) { CmdArgs.push_back(Args.MakeArgString( @@ -712,14 +713,12 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, } static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) { - const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ); - // In the absence of an option, use 8 as the default. - if (!A) - return 8; int Value = 0; - if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) { + StringRef A = Args.getLastArgValue(options::OPT_flto_partitions_EQ, "8"); + if (A.getAsInteger(10, Value) || (Value < 1)) { + Arg *Arg = Args.getLastArg(options::OPT_flto_partitions_EQ); D.Diag(diag::err_drv_invalid_int_value) - << A->getAsString(Args) << A->getValue(); + << Arg->getAsString(Args) << Arg->getValue(); return 1; } @@ -729,13 +728,8 @@ static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) { void amdgpu::addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) { - // TODO: Should this be restricted to fgpu-rdc only ? Currently we'll - // also do it for non gpu-rdc LTO - - if (unsigned NumParts = getFullLTOPartitions(D, Args); NumParts > 1) { - CmdArgs.push_back( - Args.MakeArgString("--lto-partitions=" + Twine(NumParts))); - } + CmdArgs.push_back(Args.MakeArgString("--lto-partitions=" + + Twine(getFullLTOPartitions(D, Args)))); } /// AMDGPU Toolchain diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c index 6617108e59fcf..20656f1fadeb7 100644 --- a/clang/test/Driver/amdgpu-toolchain.c +++ b/clang/test/Driver/amdgpu-toolchain.c @@ -21,7 +21,7 @@ // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ // RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s // LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions" -// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions={{[0-9]+}}"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack" +// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack" // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ // RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s @@ -38,17 +38,3 @@ // RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \ // RUN: -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s // STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o" - -// Check --flto-partitions - -// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ -// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s -// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42" - -// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ -// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s -// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a' - -// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ -// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s -// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0' diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt index a30e6f07a66a3..fa326ef46a9d1 100644 --- a/libc/startup/gpu/CMakeLists.txt +++ b/libc/startup/gpu/CMakeLists.txt @@ -33,14 +33,8 @@ function(add_startup_object name) set_target_properties(${fq_target_name}.exe PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR} RUNTIME_OUTPUT_NAME ${name}.o) - # FIXME: A bug in the AMDGPU LTO pass is incorrectly removing the kernels. - if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - target_link_options(${fq_target_name}.exe PRIVATE - "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm") - else() - target_link_options(${fq_target_name}.exe PRIVATE - "-r" "-nostdlib" "-Wl,--lto-emit-llvm") - endif() + target_link_options(${fq_target_name}.exe PRIVATE + "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm") endif() endfunction() >From 05c09d3e638e458f22a44180a995dd21f47c73a9 Mon Sep 17 00:00:00 2001 From: Joseph Huber <hube...@outlook.com> Date: Wed, 26 Mar 2025 16:52:46 -0500 Subject: [PATCH 2/3] test --- clang/test/Driver/hip-toolchain-rdc.hip | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 9015702e3211a..13b50353fc20c 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -161,3 +161,17 @@ // output the executable // LNX: [[LD:".*ld.*"]] {{.*}}"-o" "a.out" {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]] [[OBJBUNDLE]] // MSVC: [[LD:".*lld-link.*"]] {{.*}}"-out:a.exe" {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]] [[OBJBUNDLE]] + +// Check --flto-partitions + +// RUN: %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc \ +// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s +// LTO_PARTS: lld{{.*}}"--lto-partitions=42" + +// RUN: not %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc \ +// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s +// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a' + +// RUN: not %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc \ +// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s +// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0' >From 00abe864e160f093927f74b9023973fa2674f4bd Mon Sep 17 00:00:00 2001 From: Joseph Huber <hube...@outlook.com> Date: Wed, 26 Mar 2025 17:00:14 -0500 Subject: [PATCH 3/3] update --- clang/test/Driver/hip-toolchain-rdc.hip | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 13b50353fc20c..d5119e71a3569 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -165,13 +165,17 @@ // Check --flto-partitions // RUN: %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc \ -// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s +// RUN: -L. -foffload-lto %s 2>&1 | FileCheck -check-prefix=LTO_DEFAULT %s +// LTO_DEFAULT: lld{{.*}}"--lto-partitions=8" + +// RUN: %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc \ +// RUN: -L. -foffload-lto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s // LTO_PARTS: lld{{.*}}"--lto-partitions=42" // RUN: not %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc \ -// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s +// RUN: -L. -foffload-lto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s // LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a' // RUN: not %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc \ -// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s +// RUN: -L. -foffload-lto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s // LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0' _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits