https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/119009
Only OpenCL supports 2D and 3D dispatches; for the other languages, the Y and Z dimensions are always 1.

Some of the generated OpenMP functions don't seem to get the correct attributes: the kernels do, but the callable __omp_offloading functions are missing them for some reason.

>From 7c9c6c6897c74df7427c20b0e1305efaa2028c8c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Fri, 6 Dec 2024 13:21:41 -0500
Subject: [PATCH] clang/AMDGPU: Set amdgpu-max-num-workgroups to disable Y/Z
 by default

Only OpenCL supports 2D and 3D dispatches; for the other languages, the
Y and Z dimensions are always 1.

Some of the generated OpenMP functions don't seem to get the correct
attributes: the kernels do, but the callable __omp_offloading functions
are missing them for some reason.
---
 clang/include/clang/Basic/LangOptions.h      |  4 ++
 clang/lib/CodeGen/Targets/AMDGPU.cpp         | 56 +++++++++++--------
 clang/test/CodeGenHIP/default-attributes.hip |  4 +-
 clang/test/OpenMP/amdgcn-attributes.cpp      | 12 ++--
 .../amdgcn_target_global_constructor.cpp     |  2 +-
 5 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 949c8f5d448bcf..d5532eec0a683e 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -706,6 +706,10 @@ class LangOptions : public LangOptionsBase {
     return OpenCL || CUDA;
   }
 
+  /// Return true if the dispatch size for an offload language only uses one
+  /// dimension.
+  bool gridSizeIsOneDimension() const { return CUDA || HIP || OpenMP; }
+
   /// Return the OpenCL C or C++ version as a VersionTuple.
   VersionTuple getOpenCLVersionTuple() const;
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 56ad0503a11ab2..904e03b3cc7182 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -377,29 +377,6 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
-
-  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getMaxNumWorkGroupsX()
-                     ->EvaluateKnownConstInt(M.getContext())
-                     .getExtValue();
-    // Y and Z dimensions default to 1 if not specified
-    uint32_t Y = Attr->getMaxNumWorkGroupsY()
-                     ? Attr->getMaxNumWorkGroupsY()
-                           ->EvaluateKnownConstInt(M.getContext())
-                           .getExtValue()
-                     : 1;
-    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
-                     ? Attr->getMaxNumWorkGroupsZ()
-                           ->EvaluateKnownConstInt(M.getContext())
-                           .getExtValue()
-                     : 1;
-
-    llvm::SmallString<32> AttrVal;
-    llvm::raw_svector_ostream OS(AttrVal);
-    OS << X << ',' << Y << ',' << Z;
-
-    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
-  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
@@ -450,9 +427,40 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
   if (!F)
     return;
 
+  // TODO: Use AttrBuilder
   const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
-  if (FD)
+  const AMDGPUMaxNumWorkGroupsAttr *MaxNumWorkGroupsAttr = nullptr;
+  if (FD) {
     setFunctionDeclAttributes(FD, F, M);
+    MaxNumWorkGroupsAttr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>();
+  }
+
+  if (MaxNumWorkGroupsAttr) {
+    const auto *Attr = MaxNumWorkGroupsAttr;
+    uint32_t X = Attr->getMaxNumWorkGroupsX()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
+    // Y and Z dimensions default to 1 if not specified
+    uint32_t Y = Attr->getMaxNumWorkGroupsY()
+                     ? Attr->getMaxNumWorkGroupsY()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
+                     ? Attr->getMaxNumWorkGroupsZ()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+
+    llvm::SmallString<32> AttrVal;
+    llvm::raw_svector_ostream OS(AttrVal);
+    OS << X << ',' << Y << ',' << Z;
+
+    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
+  } else if (M.getLangOpts().gridSizeIsOneDimension()) {
+    // If the language only has 1D dispatches, disable Y/Z by default.
+    F->addFnAttr("amdgpu-max-num-workgroups", "4294967295,1,1");
+  }
 
   if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
     F->addFnAttr("amdgpu-ieee", "false");
diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip
index 1b53ebec9b5821..1a2cc42828c2f6 100644
--- a/clang/test/CodeGenHIP/default-attributes.hip
+++ b/clang/test/CodeGenHIP/default-attributes.hip
@@ -34,9 +34,9 @@ __global__ void kernel() {
   extern_func();
 }
 //.
-// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
 // OPTNONE: attributes #[[ATTR3]] = { convergent nounwind }
 //.
 // OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp b/clang/test/OpenMP/amdgcn-attributes.cpp
index 2c9e16a4f5098e..270cc225d05da2 100644
--- a/clang/test/OpenMP/amdgcn-attributes.cpp
+++ b/clang/test/OpenMP/amdgcn-attributes.cpp
@@ -31,10 +31,10 @@ int callable(int x) {
   return x + 1;
 }
 
-// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
-// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
+// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
 
-// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
-// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
index 9f1e68d4ea0fec..ffde39479761c4 100644
--- a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
+++ b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
@@ -98,7 +98,7 @@ S A;
 //
 //.
 // CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // CHECK: attributes #[[ATTR3]] = { convergent }
 // CHECK: attributes #[[ATTR4]] = { convergent nounwind }