https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/156686
>From 894e8b7515b5de233ec2c6f2f4329ab726c112de Mon Sep 17 00:00:00 2001
From: Shilei Tian <[email protected]>
Date: Wed, 3 Sep 2025 10:47:52 -0400
Subject: [PATCH 01/12] [Clang][HIP][CUDA] Add `__cluster_dims__` and `__no_cluster__` attribute

This PR adds basic frontend support for the `__cluster_dims__` and
`__no_cluster__` attributes.

Co-authored-by: Yaxun (Sam) Liu <[email protected]>
Co-authored-by: Jay Foad <[email protected]>
---
 clang/include/clang/Basic/Attr.td             |  17 +++
 .../clang/Basic/DiagnosticSemaKinds.td        |   8 ++
 clang/include/clang/Sema/Sema.h               |   8 ++
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  26 ++++
 .../lib/Headers/__clang_hip_runtime_wrapper.h |   2 +
 clang/lib/Sema/SemaDeclAttr.cpp               | 130 ++++++++++++++++++
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |  37 +++++
 clang/test/CodeGenCUDA/Inputs/cuda.h          |   4 +
 clang/test/CodeGenCUDA/cluster_dims.cu        |  38 +++++
 ...a-attribute-supported-attributes-list.test |   2 +
 clang/test/SemaCUDA/Inputs/cuda.h             |   2 +
 clang/test/SemaCUDA/cluster_dims.cu           |  64 +++++++++
 12 files changed, 338 insertions(+)
 create mode 100644 clang/test/CodeGenCUDA/cluster_dims.cu
 create mode 100644 clang/test/SemaCUDA/cluster_dims.cu

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 22e60aa9fe312..73461eb318b44 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1572,6 +1572,23 @@ def HIPManaged : InheritableAttr {
   let Documentation = [HIPManagedAttrDocs];
 }
 
+def CUDAClusterDims : InheritableAttr {
+  let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+  let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+  let LangOpts = [CUDA];
+  let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+  let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+  let LangOpts = [CUDA];
+  let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
 def CUDAInvalidTarget : InheritableAttr {
   let Spellings = [];
   let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 12fd7b08683e1..dc33fd49d87b3 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13070,6 +13070,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
   "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
   "%1 attribute">, InGroup<IgnoredAttributes>;
 
+def err_cuda_cluster_attr_not_supported : Error<
+  "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+  "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
 // VTable pointer authentication errors
 def err_non_polymorphic_vtable_pointer_auth : Error<
   "cannot set vtable pointer authentication on monomorphic type %0">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 37598f8530c09..10a356c92d9bd 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -5010,6 +5010,14 @@ class Sema final : public SemaBase {
   void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
                            Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
 
+  /// Add a cluster_dims attribute to a particular declaration.
+ CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI, + Expr *X, Expr *Y, Expr *Z); + void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X, + Expr *Y, Expr *Z); + /// Add a no_cluster attribute to a particular declaration. + void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI); + enum class RetainOwnershipKind { NS, CF, OS }; UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI, diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 0fcbf7e458a34..48855ce485f91 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D, void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const { + llvm::StringMap<bool> TargetFetureMap; + M.getContext().getFunctionFeatureMap(TargetFetureMap, FD); + const auto *ReqdWGS = M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr; const bool IsOpenCLKernel = @@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str()); } + + if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) { + uint32_t X = + Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue(); + uint32_t Y = + Attr->getY() + ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue() + : 1; + uint32_t Z = + Attr->getZ() + ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue() + : 1; + + llvm::SmallString<32> AttrVal; + llvm::raw_svector_ostream OS(AttrVal); + OS << X << ',' << Y << ',' << Z; + F->addFnAttr("amdgpu-cluster-dims", AttrVal.str()); + } + + // OpenCL doesn't support cluster feature. + if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) || + FD->getAttr<CUDANoClusterAttr>()) + F->addFnAttr("amdgpu-cluster-dims", "0,0,0"); } void AMDGPUTargetCodeGenInfo::setTargetAttributes( diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h index da1e39ac7270e..fb0ece96e1418 100644 --- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h +++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h @@ -25,6 +25,8 @@ #define __constant__ __attribute__((constant)) #define __managed__ __attribute__((managed)) +#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__))) + #if !defined(__cplusplus) || __cplusplus < 201103L #define nullptr NULL; #endif diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index e6f8748db7644..b58570ac0975e 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5676,6 +5676,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr); } +static std::pair<Expr *, int> +makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL, + const unsigned Idx) { + if (S.DiagnoseUnexpandedParameterPack(E)) + return {nullptr, 0}; + + // Accept template arguments for now as they depend on something else. + // We'll get to check them when they eventually get instantiated. 
+ if (E->isValueDependent()) + return {E, 1}; + + std::optional<llvm::APSInt> I = llvm::APSInt(64); + if (!(I = E->getIntegerConstantExpr(S.Context))) { + S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type) + << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange(); + return {nullptr, 0}; + } + // Make sure we can fit it in 4 bits. + if (!I->isIntN(4)) { + S.Diag(E->getExprLoc(), diag::err_ice_too_large) + << toString(*I, 10, false) << 4 << /* Unsigned */ 1; + return {nullptr, 0}; + } + if (*I < 0) + S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative) + << &AL << Idx << E->getSourceRange(); + + // We may need to perform implicit conversion of the argument. + InitializedEntity Entity = InitializedEntity::InitializeParameter( + S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false); + ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E); + assert(!ValArg.isInvalid() && + "Unexpected PerformCopyInitialization() failure."); + + return {ValArg.getAs<Expr>(), I->getZExtValue()}; +} + +CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI, + Expr *X, Expr *Y, Expr *Z) { + CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z); + + int ValX = 1; + int ValY = 1; + int ValZ = 1; + + std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0); + if (!X) + return nullptr; + + if (Y) { + std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1); + if (!Y) + return nullptr; + } + + if (Z) { + std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2); + if (!Z) + return nullptr; + } + + int FlatDim = ValX * ValY * ValZ; + auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo()) + ? Context.getAuxTargetInfo()->getTriple() + : Context.getTargetInfo().getTriple(); + int MaxDim = 1; + if (TT.isNVPTX()) + MaxDim = 8; + else if (TT.isAMDGPU()) + MaxDim = 16; + else + return nullptr; + + // A maximum of 8 thread blocks in a cluster is supported as a portable + // cluster size in CUDA. The number is 16 for AMDGPU. + if (FlatDim > MaxDim) { + Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim; + return nullptr; + } + + return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z); +} + +void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X, + Expr *Y, Expr *Z) { + if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z)) + D->addAttr(Attr); +} + +void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) { + if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI)) + D->addAttr(Attr); +} + +static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + auto &TTI = S.Context.getTargetInfo(); + auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); + if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || + (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) { + S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0; + return; + } + + if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) || + !AL.checkAtMostNumArgs(S, /*Num=*/3)) + return; + + S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0), + AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr, + AL.getNumArgs() > 2 ? 
AL.getArgAsExpr(2) : nullptr); +} + +static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + auto &TTI = S.Context.getTargetInfo(); + auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); + if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || + (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) { + S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1; + return; + } + + S.addNoClusterAttr(D, AL); +} + static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (!AL.isArgIdent(0)) { @@ -7141,6 +7265,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_CUDALaunchBounds: handleLaunchBoundsAttr(S, D, AL); break; + case ParsedAttr::AT_CUDAClusterDims: + handleClusterDimsAttr(S, D, AL); + break; + case ParsedAttr::AT_CUDANoCluster: + handleNoClusterAttr(S, D, AL); + break; case ParsedAttr::AT_Restrict: handleRestrictAttr(S, D, AL); break; diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 468bc1d677ac2..c1bb2ec1800f8 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -707,6 +707,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr( S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr); } +static void instantiateDependentCUDAClusterDimsAttr( + Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, + const CUDAClusterDimsAttr &Attr, Decl *New) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + + Expr *XExpr = nullptr; + Expr *YExpr = nullptr; + Expr *ZExpr = nullptr; + + if (Attr.getX()) { + ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs); + if (ResultX.isUsable()) + XExpr = ResultX.getAs<Expr>(); + } + + if (Attr.getY()) { + ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs); + if (ResultY.isUsable()) + YExpr = ResultY.getAs<Expr>(); + } + + if (Attr.getZ()) { + ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs); + if (ResultZ.isUsable()) + ZExpr = ResultZ.getAs<Expr>(); + } + + if (XExpr) + S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr); +} + // This doesn't take any template parameters, but we have a custom action that // needs to happen when the kernel itself is instantiated. We need to run the // ItaniumMangler to mark the names required to name this kernel. @@ -921,6 +953,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs, *this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New); } + if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) { + instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs, + *CUDAClusterDims, New); + } + if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) { instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr, New); diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h index dc85eae0c5178..e7ad784335027 100644 --- a/clang/test/CodeGenCUDA/Inputs/cuda.h +++ b/clang/test/CodeGenCUDA/Inputs/cuda.h @@ -13,6 +13,8 @@ #endif #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) #define __grid_constant__ __attribute__((grid_constant)) +#define __cluster_dims__(...) 
__attribute__((cluster_dims(__VA_ARGS__))) +#define __no_cluster__ __attribute__((no_cluster)) #else #define __constant__ #define __device__ @@ -22,6 +24,8 @@ #define __managed__ #define __launch_bounds__(...) #define __grid_constant__ +#define __cluster_dims__(...) +#define __no_cluster__ #endif struct dim3 { diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu new file mode 100644 index 0000000000000..00635e3572a7f --- /dev/null +++ b/clang/test/CodeGenCUDA/cluster_dims.cu @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s + +#include "Inputs/cuda.h" + +const int constint = 4; + +// HOST-NOT: "amdgpu-cluster-dims" + +// CHECK: "amdgpu-cluster-dims"="2,2,2" +__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} + +// CHECK: "amdgpu-cluster-dims"="2,2,1" +__global__ void __cluster_dims__(2, 2) test_literal_2d() {} + +// CHECK: "amdgpu-cluster-dims"="4,1,1" +__global__ void __cluster_dims__(4) test_literal_1d() {} + +// CHECK: "amdgpu-cluster-dims"="4,2,1" +__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {} + +// CHECK: "amdgpu-cluster-dims"="0,0,0" +__global__ void __no_cluster__ test_no_cluster() {} + +// CHECK: "amdgpu-cluster-dims"="7,1,1" +template<unsigned a> +__global__ void __cluster_dims__(a) test_template_1d() {} +template __global__ void test_template_1d<7>(); + +// CHECK: "amdgpu-cluster-dims"="2,6,1" +template<unsigned a, unsigned b> +__global__ void __cluster_dims__(a, b) test_template_2d() {} +template __global__ void test_template_2d<2, 6>(); + +// CHECK: "amdgpu-cluster-dims"="1,2,3" +template<unsigned a, unsigned b, unsigned c> +__global__ void __cluster_dims__(a, b, c) test_template_3d() {} +template __global__ void test_template_3d<1, 2, 3>(); diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index 73d4cb1769ed5..693d54159804e 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -35,6 +35,7 @@ // CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function) // CHECK-NEXT: CPUDispatch (SubjectMatchRule_function) // CHECK-NEXT: CPUSpecific (SubjectMatchRule_function) +// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function) // CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable) // CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable) // CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record) @@ -43,6 +44,7 @@ // CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter) // CHECK-NEXT: CUDAHost (SubjectMatchRule_function) // CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType) +// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function) // CHECK-NEXT: CUDAShared (SubjectMatchRule_variable) // CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function) // CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member) diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h index 10db947d8246c..2bf45e03d91c7 100644 --- a/clang/test/SemaCUDA/Inputs/cuda.h +++ b/clang/test/SemaCUDA/Inputs/cuda.h @@ -13,6 +13,8 @@ #define __managed__ __attribute__((managed)) #define __grid_constant__ 
__attribute__((grid_constant)) #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) +#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__))) +#define __no_cluster__ __attribute__((no_cluster)) struct dim3 { unsigned x, y, z; diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu new file mode 100644 index 0000000000000..3cd0e0197c29b --- /dev/null +++ b/clang/test/SemaCUDA/cluster_dims.cu @@ -0,0 +1,64 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s +// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s +// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s + +#include "Inputs/cuda.h" + +const int constint = 4; + +// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d() +__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}} + +// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d() +__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}} + +// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d() +__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}} + +// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant() +__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}} + +// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z))) +template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}} + +// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z))) +template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}} + +//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}} +__global__ void __cluster_dims__(32, 2, 4) test_too_large_dim_0() {} // common-error {{integer constant expression evaluates to value 32 that cannot be represented in a 4-bit unsigned integer type}} + +// cuda-error@+2 {{only a maximum of 8 thread blocks in a cluster is supported}} +// amd-error@+1 {{only a maximum of 16 thread blocks in a cluster is supported}} +__global__ void __cluster_dims__(4, 4, 4) test_too_large_dim_1() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}} + +// cuda-error@+3 {{only a maximum of 8 thread blocks in a cluster is supported}} +// amd-error@+2 {{only a maximum of 16 thread blocks in a cluster is supported}} +template<unsigned a, unsigned b, unsigned c> 
+__global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}} +template __global__ void test_too_large_dim_template<4, 4, 4>(); // common-note {{in instantiation of function template specialization 'test_too_large_dim_template<4U, 4U, 4U>' requested here}} + +int none_const_int = 4; + +//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}} +__global__ void __cluster_dims__(none_const_int, 2, 4) test_non_constant_0() {} // common-error {{'cluster_dims' attribute requires parameter 0 to be an integer constant}} + +//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}} +__global__ void __cluster_dims__(8, none_const_int / 2, 4) test_non_constant_1() {} // common-error {{'cluster_dims' attribute requires parameter 1 to be an integer constant}} + +//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}} +__global__ void __cluster_dims__(8, 2, none_const_int / 4) test_non_constant_2() {} // common-error {{'cluster_dims' attribute requires parameter 2 to be an integer constant}} + +//NS-error@+1 {{__no_cluster__ is not supported for this GPU architecture}} +__global__ void __no_cluster__ test_no_cluster() {} + +//NS-error@+2 {{__no_cluster__ is not supported for this GPU architecture}} +//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}} +__global__ void __no_cluster__ __cluster_dims__(2,2,2) test_have_both() {} // common-error {{'cluster_dims' and 'no_cluster' attributes are not compatible}} common-note {{conflicting attribute is here}} + +template <int... args> +__cluster_dims__(args) void test_template_variadic_args(void) {} // all-error {{expression contains unexpanded parameter pack 'args'}} + +template <int... args> +__cluster_dims__(1, args) void test_template_variadic_args_2(void) {} // all-error {{expression contains unexpanded parameter pack 'args'}} >From 118ff85603bee1b5137dc3e62093583f0dee89b3 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Wed, 3 Sep 2025 18:44:16 -0400 Subject: [PATCH 02/12] add documentation --- clang/include/clang/Basic/Attr.td | 4 ++-- clang/include/clang/Basic/AttrDocs.td | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 73461eb318b44..8353e0359565b 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1577,14 +1577,14 @@ def CUDAClusterDims : InheritableAttr { let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>]; let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; let LangOpts = [CUDA]; - let Documentation = [Undocumented]; + let Documentation = [CUDAClusterDimsAttrDoc]; } def CUDANoCluster : InheritableAttr { let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">]; let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; let LangOpts = [CUDA]; - let Documentation = [Undocumented]; + let Documentation = [CUDANoClusterAttrDoc]; } def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index e0bbda083b5cf..07e1053ddac30 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -7545,6 +7545,25 @@ A managed variable can be accessed in both device and host code. 
}]; } +def CUDAClusterDimsAttrDoc : Documentation { + let Category = DocCatDecl; + let Content = [{ +In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function +to set the dimensions of a thread block cluster. This allows to group multiple thread blocks into +a larger unit called a "cluster". `__cluster_dims__` defines the cluster size as ``(X, Y, Z)``, +where each value is the number of thread blocks in that dimension. + }]; +} + +def CUDANoClusterAttrDoc : Documentation { + let Category = DocCatDecl; + let Content = [{ +In CUDA/HIP programming, the ``__no_cluster__`` attribute can be applied to a kernel function to +indicate that the thread block cluster feature will not be enabled at both compile time and kernel +launch time. Note: this is a LLVM/Clang only attribute. + }]; +} + def LifetimeOwnerDocs : Documentation { let Category = DocCatDecl; let Content = [{ >From 0ae111127cc2582cfeaa4c93ce7a2209a48de930 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Wed, 10 Sep 2025 12:03:42 -0400 Subject: [PATCH 03/12] fix comments --- clang/include/clang/Basic/AttrDocs.td | 26 ++++++++++++++----- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/Sema/SemaDeclAttr.cpp | 17 ++++++------ 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 07e1053ddac30..6a33f08eb2a05 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -7549,18 +7549,32 @@ def CUDAClusterDimsAttrDoc : Documentation { let Category = DocCatDecl; let Content = [{ In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function -to set the dimensions of a thread block cluster. This allows to group multiple thread blocks into -a larger unit called a "cluster". `__cluster_dims__` defines the cluster size as ``(X, Y, Z)``, -where each value is the number of thread blocks in that dimension. +to set the dimensions of a thread block cluster. ``__cluster_dims__`` defines the cluster size +as ``(X, Y, Z)``, where each value is the number of thread blocks in that dimension. +The ``__cluster_dims__`` and `__no_cluster__`` attributes are mutually exclusive. + +.. code:: + + __global__ __cluster_dims__(2, 1, 1) void kernel(...) { + ... + } + }]; } def CUDANoClusterAttrDoc : Documentation { let Category = DocCatDecl; let Content = [{ -In CUDA/HIP programming, the ``__no_cluster__`` attribute can be applied to a kernel function to -indicate that the thread block cluster feature will not be enabled at both compile time and kernel -launch time. Note: this is a LLVM/Clang only attribute. +In CUDA/HIP programming, the LLVM/Clang-exclusive ``__no_cluster__`` attribute can be applied to +a kernel function to indicate that the thread block cluster feature will not be enabled at both +compile time and kernel launch time. The ``__cluster_dims__`` and `__no_cluster__`` attributes +are mutually exclusive. + +.. code:: + + __global__ __no_cluster__ void kernel(...) { + ... 
+ } }]; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dc33fd49d87b3..f36cc6055a403 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13071,7 +13071,7 @@ def warn_cuda_maxclusterrank_sm_90 : Warning< "%1 attribute">, InGroup<IgnoredAttributes>; def err_cuda_cluster_attr_not_supported : Error< - "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture" + "%0 is not supported for this GPU architecture" >; def err_cuda_cluster_dims_too_large : Error< diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index b58570ac0975e..76c3d414bfc15 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5684,11 +5684,11 @@ makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL, // Accept template arguments for now as they depend on something else. // We'll get to check them when they eventually get instantiated. - if (E->isValueDependent()) + if (E->isInstantiationDependent()) return {E, 1}; - std::optional<llvm::APSInt> I = llvm::APSInt(64); - if (!(I = E->getIntegerConstantExpr(S.Context))) { + std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context); + if (!I) { S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type) << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange(); return {nullptr, 0}; @@ -5756,7 +5756,7 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI, return nullptr; } - return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z); + return CUDAClusterDimsAttr::Create(Context, X, Y, Z, CI); } void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X, @@ -5766,8 +5766,7 @@ void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X, } void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) { - if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI)) - D->addAttr(Attr); + D->addAttr(CUDANoClusterAttr::Create(Context, CI)); } static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { @@ -5775,7 +5774,8 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) { - S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0; + S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) + << "__cluster_dims__"; return; } @@ -5793,7 +5793,8 @@ static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) { auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) { - S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1; + S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) + << "__no_cluster__"; return; } >From 273047e03479c6865e916c8412f2415a1bd990c1 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Wed, 24 Sep 2025 13:00:38 -0400 Subject: [PATCH 04/12] minor improvements --- clang/lib/Sema/SemaDeclAttr.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 76c3d414bfc15..1f487fcbe0410 
100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5680,32 +5680,33 @@ static std::pair<Expr *, int> makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL, const unsigned Idx) { if (S.DiagnoseUnexpandedParameterPack(E)) - return {nullptr, 0}; + return {}; // Accept template arguments for now as they depend on something else. // We'll get to check them when they eventually get instantiated. if (E->isInstantiationDependent()) - return {E, 1}; + return {}; std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context); if (!I) { S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type) << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange(); - return {nullptr, 0}; + return {}; } // Make sure we can fit it in 4 bits. if (!I->isIntN(4)) { S.Diag(E->getExprLoc(), diag::err_ice_too_large) - << toString(*I, 10, false) << 4 << /* Unsigned */ 1; - return {nullptr, 0}; + << toString(*I, 10, false) << 4 << /*Unsigned=*/1; + return {}; } - if (*I < 0) + if (*I < 0) { S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative) << &AL << Idx << E->getSourceRange(); + } // We may need to perform implicit conversion of the argument. InitializedEntity Entity = InitializedEntity::InitializeParameter( - S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false); + S.Context, S.Context.getConstType(S.Context.IntTy), /*consume=*/false); ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E); assert(!ValArg.isInvalid() && "Unexpected PerformCopyInitialization() failure."); >From f830f188e1598f03e4d5e5e030d47510859f91a0 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Wed, 24 Sep 2025 13:26:26 -0400 Subject: [PATCH 05/12] fix an error --- clang/lib/Sema/SemaDeclAttr.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 1f487fcbe0410..e2eae49f219ce 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5685,7 +5685,7 @@ makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL, // Accept template arguments for now as they depend on something else. // We'll get to check them when they eventually get instantiated. if (E->isInstantiationDependent()) - return {}; + return {E, 1}; std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context); if (!I) { @@ -5704,14 +5704,8 @@ makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL, << &AL << Idx << E->getSourceRange(); } - // We may need to perform implicit conversion of the argument. 
- InitializedEntity Entity = InitializedEntity::InitializeParameter( - S.Context, S.Context.getConstType(S.Context.IntTy), /*consume=*/false); - ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E); - assert(!ValArg.isInvalid() && - "Unexpected PerformCopyInitialization() failure."); - - return {ValArg.getAs<Expr>(), I->getZExtValue()}; + return {ConstantExpr::Create(S.getASTContext(), E, APValue(*I)), + I->getZExtValue()}; } CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI, >From 656ed706b1b8ac85a5b6181047c2741ad6d7f018 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Thu, 25 Sep 2025 19:29:02 -0400 Subject: [PATCH 06/12] fix another comment --- clang/include/clang/Basic/Attr.td | 4 ++-- .../test/Misc/pragma-attribute-supported-attributes-list.test | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 8353e0359565b..cff5df703ce1c 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1575,14 +1575,14 @@ def HIPManaged : InheritableAttr { def CUDAClusterDims : InheritableAttr { let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">]; let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>]; - let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; + let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; let LangOpts = [CUDA]; let Documentation = [CUDAClusterDimsAttrDoc]; } def CUDANoCluster : InheritableAttr { let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">]; - let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; + let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; let LangOpts = [CUDA]; let Documentation = [CUDANoClusterAttrDoc]; } diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index 693d54159804e..ab4153a64f028 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -35,7 +35,7 @@ // CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function) // CHECK-NEXT: CPUDispatch (SubjectMatchRule_function) // CHECK-NEXT: CPUSpecific (SubjectMatchRule_function) -// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function) +// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType) // CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable) // CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable) // CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record) @@ -44,7 +44,7 @@ // CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter) // CHECK-NEXT: CUDAHost (SubjectMatchRule_function) // CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType) -// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function) +// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType) // CHECK-NEXT: CUDAShared (SubjectMatchRule_variable) // CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function) // CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member) >From 6ecf720144ee70a22aeb966070f440a37b05e9d1 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Fri, 26 Sep 2025 00:41:39 -0400 Subject: [PATCH 07/12] more information on no_dims --- clang/include/clang/Basic/AttrDocs.td | 12 
++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 6a33f08eb2a05..7b050a9fad6ff 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -7565,10 +7565,14 @@ The ``__cluster_dims__`` and `__no_cluster__`` attributes are mutually exclusive def CUDANoClusterAttrDoc : Documentation { let Category = DocCatDecl; let Content = [{ -In CUDA/HIP programming, the LLVM/Clang-exclusive ``__no_cluster__`` attribute can be applied to -a kernel function to indicate that the thread block cluster feature will not be enabled at both -compile time and kernel launch time. The ``__cluster_dims__`` and `__no_cluster__`` attributes -are mutually exclusive. +In CUDA/HIP programming, a kernel function can still be launched with the cluster feature +enabled at runtime, even without the ``__cluster_dims__`` attribute. The LLVM/Clang-exclusive +``__no_cluster__`` attribute can be applied to a kernel function to explicitly indicate that +the cluster feature will not be enabled either at compile time or at kernel launch time. This +allows the compiler to apply certain optimizations without assuming that clustering could be +enabled at runtime. It is undefined behavior to launch a kernel with the ``__no_cluster__`` +attribute if the cluster feature is enabled at runtime. The ``__cluster_dims__`` and +``__no_cluster__`` attributes are mutually exclusive. .. code:: >From ada2f0d8b48a91a294a63e64ff3c52669b608887 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Tue, 30 Sep 2025 14:37:50 -0400 Subject: [PATCH 08/12] drop `__` in `Declspec` --- clang/include/clang/Basic/Attr.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index cff5df703ce1c..22b80bcf352ae 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1573,7 +1573,7 @@ def HIPManaged : InheritableAttr { } def CUDAClusterDims : InheritableAttr { - let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">]; + let Spellings = [GNU<"cluster_dims">, Declspec<"cluster_dims">]; let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>]; let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; let LangOpts = [CUDA]; @@ -1581,7 +1581,7 @@ def CUDAClusterDims : InheritableAttr { } def CUDANoCluster : InheritableAttr { - let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">]; + let Spellings = [GNU<"no_cluster">, Declspec<"no_cluster">]; let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; let LangOpts = [CUDA]; let Documentation = [CUDANoClusterAttrDoc]; >From 120ca45987e404bf7f912bf0779480896f8ca825 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Tue, 30 Sep 2025 17:59:55 -0400 Subject: [PATCH 09/12] fix comments --- clang/include/clang/Basic/Attr.td | 2 +- clang/lib/CodeGen/Targets/AMDGPU.cpp | 19 +++++++------------ .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 6 +++--- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 22b80bcf352ae..b0f97f745bcd9 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1574,7 +1574,7 @@ def HIPManaged : InheritableAttr { def CUDAClusterDims : InheritableAttr { let Spellings = [GNU<"cluster_dims">, Declspec<"cluster_dims">]; - let Args = [ExprArgument<"X">, 
ExprArgument<"Y", 1>, ExprArgument<"Z", 1>]; + let Args = [ExprArgument<"X">, ExprArgument<"Y", /*opt=*/1>, ExprArgument<"Z", /*opt=*/1>]; let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; let LangOpts = [CUDA]; let Documentation = [CUDAClusterDimsAttrDoc]; diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 48855ce485f91..ee4d2aa660269 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -407,20 +407,15 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( } if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) { - uint32_t X = - Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue(); - uint32_t Y = - Attr->getY() - ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue() - : 1; - uint32_t Z = - Attr->getZ() - ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue() - : 1; - + auto GetExprVal = [&](const auto &E) { + return E ? E->EvaluateKnownConstInt(M.getContext()).getExtValue() : 1; + }; + unsigned X = GetExprVal(Attr->getX()); + unsigned Y = GetExprVal(Attr->getY()); + unsigned Z = GetExprVal(Attr->getZ()); llvm::SmallString<32> AttrVal; llvm::raw_svector_ostream OS(AttrVal); - OS << X << ',' << Y << ',' << Z; + OS << X << ", " << Y << ", " << Z; F->addFnAttr("amdgpu-cluster-dims", AttrVal.str()); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index c1bb2ec1800f8..b9e7a991cd6e6 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -720,19 +720,19 @@ static void instantiateDependentCUDAClusterDimsAttr( if (Attr.getX()) { ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs); if (ResultX.isUsable()) - XExpr = ResultX.getAs<Expr>(); + XExpr = ResultX.get(); } if (Attr.getY()) { ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs); if (ResultY.isUsable()) - YExpr = ResultY.getAs<Expr>(); + YExpr = ResultY.get(); } if (Attr.getZ()) { ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs); if (ResultZ.isUsable()) - ZExpr = ResultZ.getAs<Expr>(); + ZExpr = ResultZ.get(); } if (XExpr) >From 0a6d7675dae3f17a889d923d7a36ea34ce1f1952 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Mon, 6 Oct 2025 13:53:51 -0400 Subject: [PATCH 10/12] refine target feature lookup; fix comments --- clang/include/clang/Basic/Attr.td | 4 ++-- clang/include/clang/Basic/AttrDocs.td | 7 +++--- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/CodeGen/Targets/AMDGPU.cpp | 8 ++++--- clang/lib/Sema/SemaDeclAttr.cpp | 24 +++++++++++-------- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 3 +-- clang/test/SemaCUDA/cluster_dims.cu | 8 +++---- 7 files changed, 31 insertions(+), 25 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index b0f97f745bcd9..eb48a0c01fd1e 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1573,7 +1573,7 @@ def HIPManaged : InheritableAttr { } def CUDAClusterDims : InheritableAttr { - let Spellings = [GNU<"cluster_dims">, Declspec<"cluster_dims">]; + let Spellings = [GNU<"cluster_dims">]; let Args = [ExprArgument<"X">, ExprArgument<"Y", /*opt=*/1>, ExprArgument<"Z", /*opt=*/1>]; let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; let LangOpts = [CUDA]; @@ -1581,7 +1581,7 @@ def CUDAClusterDims : InheritableAttr { } def CUDANoCluster : InheritableAttr { - let Spellings = [GNU<"no_cluster">, Declspec<"no_cluster">]; + 
let Spellings = [GNU<"no_cluster">]; let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; let LangOpts = [CUDA]; let Documentation = [CUDANoClusterAttrDoc]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 7b050a9fad6ff..2ab53208f4642 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -7549,9 +7549,10 @@ def CUDAClusterDimsAttrDoc : Documentation { let Category = DocCatDecl; let Content = [{ In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function -to set the dimensions of a thread block cluster. ``__cluster_dims__`` defines the cluster size -as ``(X, Y, Z)``, where each value is the number of thread blocks in that dimension. -The ``__cluster_dims__`` and `__no_cluster__`` attributes are mutually exclusive. +to set the dimensions of a thread block cluster, which is an optional level of hierarchy and made +up of thread blocks. ``__cluster_dims__`` defines the cluster size as ``(X, Y, Z)``, where each value +is the number of thread blocks in that dimension. The ``__cluster_dims__`` and `__no_cluster__`` +attributes are mutually exclusive. .. code:: diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f36cc6055a403..c13cfe593c24e 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13075,7 +13075,7 @@ def err_cuda_cluster_attr_not_supported : Error< >; def err_cuda_cluster_dims_too_large : Error< - "only a maximum of %0 thread blocks in a cluster is supported" + "cluster does not support more than %0 thread blocks; %1 provided" >; // VTable pointer authentication errors diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index ee4d2aa660269..bb14d4602894b 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -415,13 +415,15 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( unsigned Z = GetExprVal(Attr->getZ()); llvm::SmallString<32> AttrVal; llvm::raw_svector_ostream OS(AttrVal); - OS << X << ", " << Y << ", " << Z; + OS << X << ',' << Y << ',' << Z; F->addFnAttr("amdgpu-cluster-dims", AttrVal.str()); } // OpenCL doesn't support cluster feature. - if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) || - FD->getAttr<CUDANoClusterAttr>()) + const TargetInfo &TTI = M.getContext().getTargetInfo(); + if ((IsOpenCLKernel && + TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters")) || + FD->hasAttr<CUDANoClusterAttr>()) F->addFnAttr("amdgpu-cluster-dims", "0,0,0"); } diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index e2eae49f219ce..6da09491bbd9a 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5733,9 +5733,10 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI, } int FlatDim = ValX * ValY * ValZ; - auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo()) - ? Context.getAuxTargetInfo()->getTriple() - : Context.getTargetInfo().getTriple(); + const llvm::Triple TT = + (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo()) + ? 
Context.getAuxTargetInfo()->getTriple() + : Context.getTargetInfo().getTriple(); int MaxDim = 1; if (TT.isNVPTX()) MaxDim = 8; @@ -5747,7 +5748,8 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI, // A maximum of 8 thread blocks in a cluster is supported as a portable // cluster size in CUDA. The number is 16 for AMDGPU. if (FlatDim > MaxDim) { - Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim; + Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) + << MaxDim << FlatDim; return nullptr; } @@ -5765,10 +5767,11 @@ void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) { } static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { - auto &TTI = S.Context.getTargetInfo(); - auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); + const TargetInfo &TTI = S.Context.getTargetInfo(); + OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || - (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) { + (TTI.getTriple().isAMDGPU() && + !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) { S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << "__cluster_dims__"; return; @@ -5784,10 +5787,11 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) { - auto &TTI = S.Context.getTargetInfo(); - auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); + const TargetInfo &TTI = S.Context.getTargetInfo(); + OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || - (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) { + (TTI.getTriple().isAMDGPU() && + !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) { S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << "__no_cluster__"; return; diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index b9e7a991cd6e6..4717b4a44adb7 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -735,8 +735,7 @@ static void instantiateDependentCUDAClusterDimsAttr( ZExpr = ResultZ.get(); } - if (XExpr) - S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr); + S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr); } // This doesn't take any template parameters, but we have a custom action that diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu index 3cd0e0197c29b..a4797ef007eaa 100644 --- a/clang/test/SemaCUDA/cluster_dims.cu +++ b/clang/test/SemaCUDA/cluster_dims.cu @@ -29,12 +29,12 @@ template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x //NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}} __global__ void __cluster_dims__(32, 2, 4) test_too_large_dim_0() {} // common-error {{integer constant expression evaluates to value 32 that cannot be represented in a 4-bit unsigned integer type}} -// cuda-error@+2 {{only a maximum of 8 thread blocks in a cluster is supported}} -// amd-error@+1 {{only a maximum of 16 thread blocks in a cluster is supported}} +// cuda-error@+2 {{cluster does not support more than 8 thread blocks; 64 provided}} +// amd-error@+1 {{cluster does not support more than 16 thread blocks; 64 provided}} __global__ void __cluster_dims__(4, 4, 4) 
test_too_large_dim_1() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}} -// cuda-error@+3 {{only a maximum of 8 thread blocks in a cluster is supported}} -// amd-error@+2 {{only a maximum of 16 thread blocks in a cluster is supported}} +// cuda-error@+3 {{cluster does not support more than 8 thread blocks; 64 provided}} +// amd-error@+2 {{cluster does not support more than 16 thread blocks; 64 provided}} template<unsigned a, unsigned b, unsigned c> __global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}} template __global__ void test_too_large_dim_template<4, 4, 4>(); // common-note {{in instantiation of function template specialization 'test_too_large_dim_template<4U, 4U, 4U>' requested here}} >From b5d4c966c1b6f546787720438248f39e95f01e83 Mon Sep 17 00:00:00 2001 From: Shilei Tian <[email protected]> Date: Thu, 16 Oct 2025 14:39:46 -0400 Subject: [PATCH 11/12] fix more comments --- .../clang/Basic/DiagnosticSemaKinds.td | 10 +++--- clang/lib/Sema/SemaDeclAttr.cpp | 32 ++++++------------- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 26 ++++----------- 3 files changed, 19 insertions(+), 49 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index c13cfe593c24e..22de85d90a3cf 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13070,13 +13070,11 @@ def warn_cuda_maxclusterrank_sm_90 : Warning< "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " "%1 attribute">, InGroup<IgnoredAttributes>; -def err_cuda_cluster_attr_not_supported : Error< - "%0 is not supported for this GPU architecture" ->; +def err_cluster_attr_not_supported : Error< + "%0 is not supported for this GPU architecture">; -def err_cuda_cluster_dims_too_large : Error< - "cluster does not support more than %0 thread blocks; %1 provided" ->; +def err_cluster_dims_too_large : Error< + "cluster does not support more than %0 thread blocks; %1 provided">; // VTable pointer authentication errors def err_non_polymorphic_vtable_pointer_auth : Error< diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 6da09491bbd9a..d6f2c47676bc0 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5679,7 +5679,7 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { static std::pair<Expr *, int> makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL, const unsigned Idx) { - if (S.DiagnoseUnexpandedParameterPack(E)) + if (!E || S.DiagnoseUnexpandedParameterPack(E)) return {}; // Accept template arguments for now as they depend on something else. 
@@ -5712,26 +5712,13 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI, Expr *X, Expr *Y, Expr *Z) { CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z); - int ValX = 1; - int ValY = 1; - int ValZ = 1; + auto [NewX, ValX] = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0); + auto [NewY, ValY] = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1); + auto [NewZ, ValZ] = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2); - std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0); - if (!X) + if (!NewX || (Y && !NewY) || (Z && !NewZ)) return nullptr; - if (Y) { - std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1); - if (!Y) - return nullptr; - } - - if (Z) { - std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2); - if (!Z) - return nullptr; - } - int FlatDim = ValX * ValY * ValZ; const llvm::Triple TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo()) @@ -5748,12 +5735,11 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI, // A maximum of 8 thread blocks in a cluster is supported as a portable // cluster size in CUDA. The number is 16 for AMDGPU. if (FlatDim > MaxDim) { - Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) - << MaxDim << FlatDim; + Diag(CI.getLoc(), diag::err_cluster_dims_too_large) << MaxDim << FlatDim; return nullptr; } - return CUDAClusterDimsAttr::Create(Context, X, Y, Z, CI); + return CUDAClusterDimsAttr::Create(Context, NewX, NewY, NewZ, CI); } void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X, @@ -5772,7 +5758,7 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || (TTI.getTriple().isAMDGPU() && !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) { - S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) + S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << "__cluster_dims__"; return; } @@ -5792,7 +5778,7 @@ static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || (TTI.getTriple().isAMDGPU() && !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) { - S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) + S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << "__no_cluster__"; return; } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 4717b4a44adb7..fe6d3328acbeb 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -713,27 +713,13 @@ static void instantiateDependentCUDAClusterDimsAttr( EnterExpressionEvaluationContext Unevaluated( S, Sema::ExpressionEvaluationContext::ConstantEvaluated); - Expr *XExpr = nullptr; - Expr *YExpr = nullptr; - Expr *ZExpr = nullptr; - - if (Attr.getX()) { - ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs); - if (ResultX.isUsable()) - XExpr = ResultX.get(); - } - - if (Attr.getY()) { - ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs); - if (ResultY.isUsable()) - YExpr = ResultY.get(); - } + auto SubstElt = [&S, &TemplateArgs](Expr *E) { + return E ? 
S.SubstExpr(E, TemplateArgs).get() : nullptr;
+  };
-  if (Attr.getZ()) {
-    ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
-    if (ResultZ.isUsable())
-      ZExpr = ResultZ.get();
-  }
+  Expr *XExpr = SubstElt(Attr.getX());
+  Expr *YExpr = SubstElt(Attr.getY());
+  Expr *ZExpr = SubstElt(Attr.getZ());
 
   S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
 }

>From b009efb9189b69b890efccae749152cf72f8a330 Mon Sep 17 00:00:00 2001
From: Shilei Tian <[email protected]>
Date: Thu, 16 Oct 2025 14:47:01 -0400
Subject: [PATCH 12/12] remove unused code

---
 clang/lib/CodeGen/Targets/AMDGPU.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index bb14d4602894b..16d5919d62cbb 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,9 +342,6 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
 
 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
-  llvm::StringMap<bool> TargetFetureMap;
-  M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
-
   const auto *ReqdWGS =
       M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
   const bool IsOpenCLKernel =
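For reference, here is a minimal HIP usage sketch of the two attributes this series adds, modeled on the clang/test/CodeGenCUDA/cluster_dims.cu and clang/test/SemaCUDA/cluster_dims.cu tests above. The kernel names are made up for illustration; it assumes a device where clusters are available (gfx1250 on AMDGPU, sm_90 on NVPTX) and the macro definitions added to the test Inputs/cuda.h headers in this series (the HIP runtime wrapper gains the __cluster_dims__ macro):

  // Request a 2x2x1 thread block cluster for this kernel. On AMDGPU this is
  // lowered to the "amdgpu-cluster-dims"="2,2,1" IR function attribute.
  __global__ void __cluster_dims__(2, 2) kernel_with_cluster() {}

  // Promise that this kernel is never launched with clusters enabled. On
  // AMDGPU this is lowered to "amdgpu-cluster-dims"="0,0,0".
  __global__ void __no_cluster__ kernel_without_cluster() {}

Per the Sema checks in the series, the product of the cluster dimensions may not exceed 8 on NVPTX (16 on AMDGPU), and the two attributes are mutually exclusive on a given kernel.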
