[libclc] [LIBCLC] Teach prepare-builtins how to handle text based IR (PR #66993)
https://github.com/jchlanda created https://github.com/llvm/llvm-project/pull/66993 None >From c37f854eb496937d0eb017b14bd8a9accefdab80 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Thu, 21 Sep 2023 10:48:33 +0100 Subject: [PATCH] [LIBCLC] Teach prepare-builtins how to handle text based IR --- libclc/utils/prepare-builtins.cpp | 37 ++- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/libclc/utils/prepare-builtins.cpp b/libclc/utils/prepare-builtins.cpp index 550b5971913f48a..ebdbc68cfee3b3c 100644 --- a/libclc/utils/prepare-builtins.cpp +++ b/libclc/utils/prepare-builtins.cpp @@ -5,23 +5,27 @@ #include "llvm/Bitcode/ReaderWriter.h" #endif +#include "llvm/Config/llvm-config.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IRReader/IRReader.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" -#include "llvm/Config/llvm-config.h" +#include "llvm/Support/raw_ostream.h" #include using namespace llvm; +static ExitOnError ExitOnErr; + static cl::opt InputFilename(cl::Positional, cl::desc(""), cl::init("-")); @@ -29,6 +33,9 @@ static cl::opt OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename")); +static cl::opt TextualOut("S", cl::desc("Emit LLVM textual assembly"), +cl::init(false)); + int main(int argc, char **argv) { LLVMContext Context; llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. 
@@ -45,17 +52,15 @@ int main(int argc, char **argv) { ErrorMessage = ec.message(); } else { std::unique_ptr &BufferPtr = BufferOrErr.get(); - ErrorOr> ModuleOrErr = + SMDiagnostic Err; + std::unique_ptr MPtr = #if HAVE_LLVM > 0x0390 - expectedToErrorOrAndEmitErrors(Context, - parseBitcodeFile(BufferPtr.get()->getMemBufferRef(), Context)); + ExitOnErr(Expected>( + parseIR(BufferPtr.get()->getMemBufferRef(), Err, Context))); #else - parseBitcodeFile(BufferPtr.get()->getMemBufferRef(), Context); + parseIR(BufferPtr.get()->getMemBufferRef(), Err, Context); #endif - if (std::error_code ec = ModuleOrErr.getError()) -ErrorMessage = ec.message(); - - M = ModuleOrErr.get().release(); + M = MPtr.release(); } } @@ -105,14 +110,16 @@ int main(int argc, char **argv) { exit(1); } + if (TextualOut) +M->print(Out->os(), nullptr, true); + else #if HAVE_LLVM >= 0x0700 - WriteBitcodeToFile(*M, Out->os()); +WriteBitcodeToFile(*M, Out->os()); #else - WriteBitcodeToFile(M, Out->os()); +WriteBitcodeToFile(M, Out->os()); #endif // Declare success. Out->keep(); return 0; } - ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
jchlanda wrote: A friendly ping. https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -307,6 +307,14 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm", MinBlocks.getExtValue()); } + if (Attr->getMaxBlocks()) { +llvm::APSInt MaxBlocks(32); +MaxBlocks = Attr->getMaxBlocks()->EvaluateKnownConstInt(getContext()); +if (MaxBlocks > 0) + // Create !{, metadata !"maxclusterrank", i32 } node + NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank", jchlanda wrote: That's a good question, so `makeLaunchBoundsArgExpr` does perform a [check for negative values](https://github.com/jchlanda/llvm-project/blob/jakub/launch_bounds_maxclusterrank/clang/lib/Sema/SemaDeclAttr.cpp#L5653), but lets the value pass (unlike for the case of values > 32 bits, when it returns `nullptr`). I didn't want to change it, so I catch the negative case here. https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -5662,22 +5677,39 @@ Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, return nullptr; } + if (MaxBlocks) { +// Feature '.maxclusterrank' requires .target sm_90 or higher. +auto SM = getCudaArch(Context.getTargetInfo()); +if (SM == CudaArch::UNKNOWN || SM < CudaArch::SM_90) { + Diag(MaxBlocks->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90) + << CudaArchToString(SM) << CI << MaxBlocks->getSourceRange(); + // Ignore it by setting MaxBlocks to null; + MaxBlocks = nullptr; +} else { + MaxBlocks = makeLaunchBoundsArgExpr(*this, MaxBlocks, TmpAttr, 2); + if (MaxBlocks == nullptr) jchlanda wrote: I wanted to align with the checks that are already in the body of `Sema::CreateLaunchBoundsAttr`, but I'm with you and would like to use the fact that pointers are truthy; I will change them all. https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/66496 >From 9c8caed3c8def15ccdbfdf831f36d0befed1fc84 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 15 Sep 2023 12:08:04 +0100 Subject: [PATCH 1/2] [NVPTX] Add support for maxclusterrank in launch_bounds Since SM_90 CUDA supports specifying additional argument to the launch_bounds attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can be part of the cluster. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details. --- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 46 +-- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 clang/test/SemaCUDA/launch_bounds.cu | 4 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 45 +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 79 +-- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + 13 files changed, 227 insertions(+), 58 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c95db7e8049d47a..3c51261bd3eb081 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = 
SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other parts diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0ac4df8edb242f6..088e3a45c7babba 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11836,6 +11836,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 47379e00a7445e3..dca7b66da3796d9 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11051,12 +11051,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. 
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 0d4bbd795648008..64d019a10514d60 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue()); - // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was - // not specified in __launch_bounds__ or if the user specified a 0 value, + // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it + // was not specified in __launch_bounds__ or if the user specified
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda resolved https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -537,59 +537,52 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const { // If the NVVM IR has some of reqntid* specified, then output // the reqntid directive, and set the unspecified ones to 1. - // If none of reqntid* is specified, don't output reqntid directive. - unsigned reqntidx, reqntidy, reqntidz; - bool specified = false; - if (!getReqNTIDx(F, reqntidx)) -reqntidx = 1; - else -specified = true; - if (!getReqNTIDy(F, reqntidy)) -reqntidy = 1; - else -specified = true; - if (!getReqNTIDz(F, reqntidz)) -reqntidz = 1; - else -specified = true; - - if (specified) -O << ".reqntid " << reqntidx << ", " << reqntidy << ", " << reqntidz + // If none of Reqntid* is specified, don't output reqntid directive. + unsigned Reqntidx, Reqntidy, Reqntidz; + Reqntidx = Reqntidy = Reqntidz = 1; + bool ReqSpecified = false; + if (getReqNTIDx(F, Reqntidx)) +ReqSpecified |= true; + if (getReqNTIDy(F, Reqntidy)) +ReqSpecified |= true; + if (getReqNTIDz(F, Reqntidz)) +ReqSpecified |= true; + + if (ReqSpecified) +O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz << "\n"; // If the NVVM IR has some of maxntid* specified, then output // the maxntid directive, and set the unspecified ones to 1. // If none of maxntid* is specified, don't output maxntid directive. 
- unsigned maxntidx, maxntidy, maxntidz; - specified = false; - if (!getMaxNTIDx(F, maxntidx)) -maxntidx = 1; - else -specified = true; - if (!getMaxNTIDy(F, maxntidy)) -maxntidy = 1; - else -specified = true; - if (!getMaxNTIDz(F, maxntidz)) -maxntidz = 1; - else -specified = true; - - if (specified) -O << ".maxntid " << maxntidx << ", " << maxntidy << ", " << maxntidz + unsigned Maxntidx, Maxntidy, Maxntidz; + Maxntidx = Maxntidy = Maxntidz = 1; + bool MaxSpecified = false; + if (getMaxNTIDx(F, Maxntidx)) +MaxSpecified |= true; + if (!getMaxNTIDy(F, Maxntidy)) +MaxSpecified |= true; + if (!getMaxNTIDz(F, Maxntidz)) +MaxSpecified |= true; jchlanda wrote: Yes, excellent spot! https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/66496 >From 9c8caed3c8def15ccdbfdf831f36d0befed1fc84 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 15 Sep 2023 12:08:04 +0100 Subject: [PATCH 1/3] [NVPTX] Add support for maxclusterrank in launch_bounds Since SM_90 CUDA supports specifying additional argument to the launch_bounds attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can be part of the cluster. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details. --- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 46 +-- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 clang/test/SemaCUDA/launch_bounds.cu | 4 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 45 +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 79 +-- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + 13 files changed, 227 insertions(+), 58 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c95db7e8049d47a..3c51261bd3eb081 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = 
SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other parts diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0ac4df8edb242f6..088e3a45c7babba 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11836,6 +11836,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 47379e00a7445e3..dca7b66da3796d9 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11051,12 +11051,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. 
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 0d4bbd795648008..64d019a10514d60 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue()); - // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was - // not specified in __launch_bounds__ or if the user specified a 0 value, + // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it + // was not specified in __launch_bounds__ or if the user specified
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda resolved https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda resolved https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libclc] [LIBCLC] Teach prepare-builtins how to handle text based IR (PR #66993)
@@ -5,30 +5,37 @@ #include "llvm/Bitcode/ReaderWriter.h" #endif +#include "llvm/Config/llvm-config.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IRReader/IRReader.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" -#include "llvm/Config/llvm-config.h" +#include "llvm/Support/raw_ostream.h" #include using namespace llvm; +static ExitOnError ExitOnErr; + static cl::opt InputFilename(cl::Positional, cl::desc(""), cl::init("-")); static cl::opt OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename")); +static cl::opt TextualOut("S", cl::desc("Emit LLVM textual assembly"), jchlanda wrote: The default is to output the binary IR, regardless of the input format. And the `S` switch overrides the input type, so providing the tool with binary IR, while specifying `-S` results in textual IR output. https://github.com/llvm/llvm-project/pull/66993 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda resolved https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/66496 >From 9c8caed3c8def15ccdbfdf831f36d0befed1fc84 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 15 Sep 2023 12:08:04 +0100 Subject: [PATCH 1/4] [NVPTX] Add support for maxclusterrank in launch_bounds Since SM_90 CUDA supports specifying additional argument to the launch_bounds attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can be part of the cluster. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details. --- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 46 +-- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 clang/test/SemaCUDA/launch_bounds.cu | 4 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 45 +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 79 +-- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + 13 files changed, 227 insertions(+), 58 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c95db7e8049d47a..3c51261bd3eb081 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = 
SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other parts diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0ac4df8edb242f6..088e3a45c7babba 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11836,6 +11836,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 47379e00a7445e3..dca7b66da3796d9 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11051,12 +11051,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. 
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 0d4bbd795648008..64d019a10514d60 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue()); - // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was - // not specified in __launch_bounds__ or if the user specified a 0 value, + // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it + // was not specified in __launch_bounds__ or if the user specified
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -5607,6 +5607,21 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) { return false; } +// Helper to get CudaArch. +static CudaArch getCudaArch(const TargetInfo &TI) { jchlanda wrote: Is that the kind of thing you had in mind: ```diff diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index 6fa0b8df97d7..20d76b702a94 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -181,6 +181,8 @@ public: bool hasBitIntType() const override { return true; } bool hasBFloat16Type() const override { return true; } + + CudaArch getGPU() const { return GPU; } }; } // namespace targets } // namespace clang diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index c4ecaec7728b..636bb0694d36 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -10,6 +10,7 @@ // //===--===// +#include "../Basic/Targets/NVPTX.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTMutationListener.h" @@ -5609,17 +5610,7 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) { // Helper to get CudaArch. static CudaArch getCudaArch(const TargetInfo &TI) { - if (!TI.hasFeature("ptx")) { -return CudaArch::UNKNOWN; - } - for (const auto &Feature : TI.getTargetOpts().FeatureMap) { -if (Feature.getValue()) { - CudaArch Arch = StringToCudaArch(Feature.getKey()); - if (Arch != CudaArch::UNKNOWN) -return Arch; -} - } - return CudaArch::UNKNOWN; + return static_cast(&TI)->getGPU(); } // Checks whether an argument of launch_bounds attribute is ``` https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -11836,6 +11836,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; jchlanda wrote: The whole thing, this is analogous to how we currently handle: ``` __launch_bounds__(128, -2) ``` we issue a warning: ``` /home/dev/llvm/clang/test/SemaCUDA/launch_bounds_running_test.cu:5:24: warning: 'launch_bounds' attribute parameter 1 is negative and will be ignored [-Wcuda-compat] 5 | __launch_bounds__(128, -2) void Test2Args(void); |^~ /home/dev/llvm/clang/test/SemaCUDA/Inputs/cuda.h:14:61: note: expanded from macro '__launch_bounds__' 14 | #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) | ^~~ 1 warning generated when compiling for host. ``` vs max cluster rank: ``` /home/dev/llvm/clang/test/SemaCUDA/launch_bounds_running_test.cu:5:27: warning: 'launch_bounds' attribute parameter 2 is negative and will be ignored [-Wcuda-compat] 5 | __launch_bounds__(128, 2, -8) void Test2Args(void); | ^~ /home/dev/llvm/clang/test/SemaCUDA/Inputs/cuda.h:14:61: note: expanded from macro '__launch_bounds__' 14 | #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) | ^~~ 1 warning generated when compiling for host. ``` and the resulting asm contains neither of the directives. https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -537,59 +537,46 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const { // If the NVVM IR has some of reqntid* specified, then output // the reqntid directive, and set the unspecified ones to 1. - // If none of reqntid* is specified, don't output reqntid directive. - unsigned reqntidx, reqntidy, reqntidz; - bool specified = false; - if (!getReqNTIDx(F, reqntidx)) -reqntidx = 1; - else -specified = true; - if (!getReqNTIDy(F, reqntidy)) -reqntidy = 1; - else -specified = true; - if (!getReqNTIDz(F, reqntidz)) -reqntidz = 1; - else -specified = true; - - if (specified) -O << ".reqntid " << reqntidx << ", " << reqntidy << ", " << reqntidz + // If none of Reqntid* is specified, don't output reqntid directive. + unsigned Reqntidx, Reqntidy, Reqntidz; + Reqntidx = Reqntidy = Reqntidz = 1; + bool ReqSpecified = false; + ReqSpecified |= getReqNTIDx(F, Reqntidx); + ReqSpecified |= getReqNTIDy(F, Reqntidy); + ReqSpecified |= getReqNTIDz(F, Reqntidz); + + if (ReqSpecified) +O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz << "\n"; // If the NVVM IR has some of maxntid* specified, then output // the maxntid directive, and set the unspecified ones to 1. // If none of maxntid* is specified, don't output maxntid directive. 
- unsigned maxntidx, maxntidy, maxntidz; - specified = false; - if (!getMaxNTIDx(F, maxntidx)) -maxntidx = 1; - else -specified = true; - if (!getMaxNTIDy(F, maxntidy)) -maxntidy = 1; - else -specified = true; - if (!getMaxNTIDz(F, maxntidz)) -maxntidz = 1; - else -specified = true; - - if (specified) -O << ".maxntid " << maxntidx << ", " << maxntidy << ", " << maxntidz + unsigned Maxntidx, Maxntidy, Maxntidz; + Maxntidx = Maxntidy = Maxntidz = 1; + bool MaxSpecified = false; + MaxSpecified |= getMaxNTIDx(F, Maxntidx); + MaxSpecified |= getMaxNTIDy(F, Maxntidy); + MaxSpecified |= getMaxNTIDz(F, Maxntidz); + + if (MaxSpecified) +O << ".maxntid " << Maxntidx << ", " << Maxntidy << ", " << Maxntidz << "\n"; - unsigned mincta; - if (getMinCTASm(F, mincta)) -O << ".minnctapersm " << mincta << "\n"; + unsigned Mincta = 0; + if (getMinCTASm(F, Mincta)) +O << ".minnctapersm " << Mincta << "\n"; - unsigned maxnreg; - if (getMaxNReg(F, maxnreg)) -O << ".maxnreg " << maxnreg << "\n"; + unsigned Maxnreg = 0; + if (getMaxNReg(F, Maxnreg)) +O << ".maxnreg " << Maxnreg << "\n"; + + unsigned Maxclusterrank = 0; jchlanda wrote: You are right, `ptxas` reacts to a sample with `.maxclusterrank` with pre SM_90 with a hard error: ``` ptxas --gpu-name sm_75 --output-file cluster_rank.o cluster_rank.s ptxas cluster_rank.s, line 18; error : Feature '.maxclusterrank' requires .target sm_90 or higher ptxas fatal : Ptx assembly aborted due to errors ``` Do I understand you right, that you'd like to see a [check similar to what we do in SemaDeclAttr](https://github.com/llvm/llvm-project/pull/66496/files#diff-2a5bdb2d9f07f8d77de51d5403d349c22978141b6de6bd87fc5e449f5ed95becR5683) and filter out the directive on targets < SM_90? https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/66496 >From 9c8caed3c8def15ccdbfdf831f36d0befed1fc84 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 15 Sep 2023 12:08:04 +0100 Subject: [PATCH 1/5] [NVPTX] Add support for maxclusterrank in launch_bounds Since SM_90 CUDA supports specifying additional argument to the launch_bounds attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can be part of the cluster. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details. --- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 46 +-- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 clang/test/SemaCUDA/launch_bounds.cu | 4 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 45 +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 79 +-- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + 13 files changed, 227 insertions(+), 58 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c95db7e8049d47a..3c51261bd3eb081 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = 
SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other parts diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0ac4df8edb242f6..088e3a45c7babba 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11836,6 +11836,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 47379e00a7445e3..dca7b66da3796d9 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11051,12 +11051,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. 
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 0d4bbd795648008..64d019a10514d60 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue()); - // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was - // not specified in __launch_bounds__ or if the user specified a 0 value, + // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it + // was not specified in __launch_bounds__ or if the user specified
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -12,7 +12,7 @@ __launch_bounds__(0x1) void TestWayTooBigArg(void); // expected- __launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}} __launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}} jchlanda wrote: Done in 028d270290218f3cc4fb35acc721b0645f2118ea https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -0,0 +1,45 @@ +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -triple nvptx-unknown-unknown -target-cpu sm_90 -verify %s + +#include "Inputs/cuda.h" + +__launch_bounds__(128, 7) void Test2Args(void); +__launch_bounds__(128) void Test1Arg(void); + +__launch_bounds__(0x) void TestMaxArg(void); +__launch_bounds__(0x1) void TestTooBigArg(void); // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}} +__launch_bounds__(0x1) void TestWayTooBigArg(void); // expected-error {{integer literal is too large to be represented in any integer type}} +__launch_bounds__(1, 1, 0x1) void TestWayTooBigArg(void); // expected-error {{integer literal is too large to be represented in any integer type}} + +__launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}} +__launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}} +__launch_bounds__(128, 1, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 2 is negative and will be ignored}} jchlanda wrote: Done in 028d270290218f3cc4fb35acc721b0645f2118ea https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/66496 >From 9c8caed3c8def15ccdbfdf831f36d0befed1fc84 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 15 Sep 2023 12:08:04 +0100 Subject: [PATCH 1/6] [NVPTX] Add support for maxclusterrank in launch_bounds Since SM_90 CUDA supports specifying additional argument to the launch_bounds attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can be part of the cluster. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details. --- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 46 +-- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 clang/test/SemaCUDA/launch_bounds.cu | 4 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 45 +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 79 +-- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + 13 files changed, 227 insertions(+), 58 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c95db7e8049d47a..3c51261bd3eb081 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = 
SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other parts diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0ac4df8edb242f6..088e3a45c7babba 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11836,6 +11836,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 47379e00a7445e3..dca7b66da3796d9 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11051,12 +11051,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. 
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 0d4bbd795648008..64d019a10514d60 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue()); - // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was - // not specified in __launch_bounds__ or if the user specified a 0 value, + // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it + // was not specified in __launch_bounds__ or if the user specified
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -5607,6 +5607,21 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) { return false; } +// Helper to get CudaArch. +static CudaArch getCudaArch(const TargetInfo &TI) { jchlanda wrote: Done in: 3c17966b26a613f3b1a117f992d45b751cbff463 https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/66496 >From 9c8caed3c8def15ccdbfdf831f36d0befed1fc84 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 15 Sep 2023 12:08:04 +0100 Subject: [PATCH 1/7] [NVPTX] Add support for maxclusterrank in launch_bounds Since SM_90 CUDA supports specifying additional argument to the launch_bounds attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can be part of the cluster. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details. --- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 46 +-- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 clang/test/SemaCUDA/launch_bounds.cu | 4 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 45 +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 79 +-- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + 13 files changed, 227 insertions(+), 58 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c95db7e8049d47a..3c51261bd3eb081 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = 
SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other parts diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0ac4df8edb242f6..088e3a45c7babba 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11836,6 +11836,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 47379e00a7445e3..dca7b66da3796d9 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11051,12 +11051,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. 
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 0d4bbd795648008..64d019a10514d60 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue()); - // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was - // not specified in __launch_bounds__ or if the user specified a 0 value, + // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it + // was not specified in __launch_bounds__ or if the user specified
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -537,59 +537,46 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const { // If the NVVM IR has some of reqntid* specified, then output // the reqntid directive, and set the unspecified ones to 1. - // If none of reqntid* is specified, don't output reqntid directive. - unsigned reqntidx, reqntidy, reqntidz; - bool specified = false; - if (!getReqNTIDx(F, reqntidx)) -reqntidx = 1; - else -specified = true; - if (!getReqNTIDy(F, reqntidy)) -reqntidy = 1; - else -specified = true; - if (!getReqNTIDz(F, reqntidz)) -reqntidz = 1; - else -specified = true; - - if (specified) -O << ".reqntid " << reqntidx << ", " << reqntidy << ", " << reqntidz + // If none of Reqntid* is specified, don't output reqntid directive. + unsigned Reqntidx, Reqntidy, Reqntidz; + Reqntidx = Reqntidy = Reqntidz = 1; + bool ReqSpecified = false; + ReqSpecified |= getReqNTIDx(F, Reqntidx); + ReqSpecified |= getReqNTIDy(F, Reqntidy); + ReqSpecified |= getReqNTIDz(F, Reqntidz); + + if (ReqSpecified) +O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz << "\n"; // If the NVVM IR has some of maxntid* specified, then output // the maxntid directive, and set the unspecified ones to 1. // If none of maxntid* is specified, don't output maxntid directive. 
- unsigned maxntidx, maxntidy, maxntidz; - specified = false; - if (!getMaxNTIDx(F, maxntidx)) -maxntidx = 1; - else -specified = true; - if (!getMaxNTIDy(F, maxntidy)) -maxntidy = 1; - else -specified = true; - if (!getMaxNTIDz(F, maxntidz)) -maxntidz = 1; - else -specified = true; - - if (specified) -O << ".maxntid " << maxntidx << ", " << maxntidy << ", " << maxntidz + unsigned Maxntidx, Maxntidy, Maxntidz; + Maxntidx = Maxntidy = Maxntidz = 1; + bool MaxSpecified = false; + MaxSpecified |= getMaxNTIDx(F, Maxntidx); + MaxSpecified |= getMaxNTIDy(F, Maxntidy); + MaxSpecified |= getMaxNTIDz(F, Maxntidz); + + if (MaxSpecified) +O << ".maxntid " << Maxntidx << ", " << Maxntidy << ", " << Maxntidz << "\n"; - unsigned mincta; - if (getMinCTASm(F, mincta)) -O << ".minnctapersm " << mincta << "\n"; + unsigned Mincta = 0; + if (getMinCTASm(F, Mincta)) +O << ".minnctapersm " << Mincta << "\n"; - unsigned maxnreg; - if (getMaxNReg(F, maxnreg)) -O << ".maxnreg " << maxnreg << "\n"; + unsigned Maxnreg = 0; + if (getMaxNReg(F, Maxnreg)) +O << ".maxnreg " << Maxnreg << "\n"; + + unsigned Maxclusterrank = 0; jchlanda wrote: Sure, done in: 261840a8bc58258b0e6ce45dd72e7e918a77c692 https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/66496 >From 437c41f418be8a812229acc3573ebba688832ad5 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 15 Sep 2023 12:08:04 +0100 Subject: [PATCH] [NVPTX] Add support for maxclusterrank in launch_bounds Since SM_90 CUDA supports specifying additional argument to the launch_bounds attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can be part of the cluster. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details. --- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/Basic/Targets/NVPTX.h | 2 + clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 43 --- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 + clang/test/SemaCUDA/launch_bounds.cu | 7 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 57 ++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 77 --- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + llvm/test/CodeGen/NVPTX/maxclusterrank.ll | 26 +++ 15 files changed, 262 insertions(+), 61 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu create mode 100644 llvm/test/CodeGen/NVPTX/maxclusterrank.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index dd4d45171db4899..fbc27d166ed9dd1 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let 
Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other parts diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f4eb02fd9570c2f..29362df68365350 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11850,6 +11850,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 712db0a3dd895d5..e13524b5f3b30cf 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11053,12 +11053,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. 
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index 6fa0b8df97d7894..20d76b702a9426e 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -181,6 +181,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { bool hasBitIntType() const override { return true; } bool hasBFloat16Type() const override { return true; } + + CudaArch getGPU() const { return GPU; } }; } // namespace targets } // namespace clang diff --git a/clang/lib/CodeGen/Targets/NVPTX.cp
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda closed https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
@@ -10,6 +10,7 @@ // //===--===// +#include "../Basic/Targets/NVPTX.h" jchlanda wrote: @sam-mccall, apologies for introducing the bug and thank you for drawing my attention to it. I've got the fix for the problem: ```diff diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 10d1c910d9cd..3b87300e24bc 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -10,7 +10,6 @@ // //===--===// -#include "../Basic/Targets/NVPTX.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTMutationListener.h" @@ -5612,7 +5611,8 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) { static CudaArch getCudaArch(const TargetInfo &TI) { if (!TI.getTriple().isNVPTX()) llvm_unreachable("getCudaArch is only valid for NVPTX triple"); - return static_cast(&TI)->getGPU(); + auto &TO = TI.getTargetOpts(); + return StringToCudaArch(TO.CPU); } // Checks whether an argument of launch_bounds attribute is ``` Would you be so kind and point me to the process for "reverting the revert" and folding the fix into the original patch? https://github.com/llvm/llvm-project/pull/66496 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Reland [NVPTX] Add support for maxclusterrank in launch_bounds (#66496) (PR #67667)
https://github.com/jchlanda created https://github.com/llvm/llvm-project/pull/67667 This reverts commit 0afbcb20fd908f8bf9073697423da097be7db592. >From d8c1372998a74dfbfea921bf049575e5e9c0c5a7 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Thu, 28 Sep 2023 13:30:27 +0100 Subject: [PATCH] Reland [NVPTX] Add support for maxclusterrank in launch_bounds (#66496) This reverts commit 0afbcb20fd908f8bf9073697423da097be7db592. --- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/Basic/Targets/NVPTX.h | 2 + clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 43 --- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 + clang/test/SemaCUDA/launch_bounds.cu | 7 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 57 ++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 77 --- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + llvm/test/CodeGen/NVPTX/maxclusterrank.ll | 26 +++ 15 files changed, 262 insertions(+), 61 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu create mode 100644 llvm/test/CodeGen/NVPTX/maxclusterrank.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index dd4d45171db4899..fbc27d166ed9dd1 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other 
parts diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f4eb02fd9570c2f..29362df68365350 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11850,6 +11850,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 712db0a3dd895d5..e13524b5f3b30cf 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11053,12 +11053,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. 
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index 6fa0b8df97d7894..20d76b702a9426e 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -181,6 +181,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { bool hasBitIntType() const override { return true; } bool hasBFloat16Type() const override { return true; } + + CudaArch getGPU() const { return GPU; } }; } // namespace targets } // namespace clang diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 0d4bbd795648008..64d019a10514d60 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXT
[clang] Reland [NVPTX] Add support for maxclusterrank in launch_bounds (#66496) (PR #67667)
jchlanda wrote: As discussed in: https://github.com/llvm/llvm-project/pull/66496#discussion_r134239 https://github.com/llvm/llvm-project/pull/67667 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Reland [NVPTX] Add support for maxclusterrank in launch_bounds (#66496) (PR #67667)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/67667 >From 6d17781780584d5bc123e93e1388e64df0bbd3f9 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Thu, 28 Sep 2023 13:30:27 +0100 Subject: [PATCH] Reland [NVPTX] Add support for maxclusterrank in launch_bounds (#66496) This reverts commit 0afbcb20fd908f8bf9073697423da097be7db592. --- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/Basic/Targets/NVPTX.h | 2 + clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 43 --- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 + clang/test/SemaCUDA/launch_bounds.cu | 7 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 57 ++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 77 --- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + llvm/test/CodeGen/NVPTX/maxclusterrank.ll | 26 +++ 15 files changed, 262 insertions(+), 61 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu create mode 100644 llvm/test/CodeGen/NVPTX/maxclusterrank.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index dd4d45171db4899..fbc27d166ed9dd1 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other parts diff --git 
a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f4eb02fd9570c2f..29362df68365350 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11850,6 +11850,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 712db0a3dd895d5..e13524b5f3b30cf 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11053,12 +11053,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. 
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index 6fa0b8df97d7894..20d76b702a9426e 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -181,6 +181,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { bool hasBitIntType() const override { return true; } bool hasBFloat16Type() const override { return true; } + + CudaArch getGPU() const { return GPU; } }; } // namespace targets } // namespace clang diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 0d4bbd795648008..64d019a10514d60 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
[clang] Reland [NVPTX] Add support for maxclusterrank in launch_bounds (#66496) (PR #67667)
https://github.com/jchlanda closed https://github.com/llvm/llvm-project/pull/67667 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libclc] [LIBCLC] Teach prepare-builtins how to handle text based IR (PR #66993)
https://github.com/jchlanda closed https://github.com/llvm/llvm-project/pull/66993 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)
https://github.com/jchlanda created https://github.com/llvm/llvm-project/pull/66496 Since SM_90 CUDA supports specifying additional argument to the launch_bounds attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can be part of the cluster. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details. >From 9c8caed3c8def15ccdbfdf831f36d0befed1fc84 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 15 Sep 2023 12:08:04 +0100 Subject: [PATCH] [NVPTX] Add support for maxclusterrank in launch_bounds Since SM_90 CUDA supports specifying additional argument to the launch_bounds attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can be part of the cluster. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details. 
--- clang/include/clang/Basic/Attr.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td| 4 + clang/include/clang/Sema/Sema.h | 5 +- clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++- clang/lib/Parse/ParseOpenMP.cpp | 3 +- clang/lib/Sema/SemaDeclAttr.cpp | 46 +-- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++- clang/test/CodeGenCUDA/launch-bounds.cu | 69 clang/test/SemaCUDA/launch_bounds.cu | 4 +- clang/test/SemaCUDA/launch_bounds_sm_90.cu| 45 +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 79 +-- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h| 1 + 13 files changed, 227 insertions(+), 58 deletions(-) create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c95db7e8049d47a..3c51261bd3eb081 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr { def CUDALaunchBounds : InheritableAttr { let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; - let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; + let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>, + ExprArgument<"MaxBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; // An AST node is created for this attribute, but is not used by other parts diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0ac4df8edb242f6..088e3a45c7babba 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11836,6 +11836,10 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_cuda_maxclusterrank_sm_90 : Warning< + "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "%1 
attribute">, InGroup; + def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " "have a bit size of at least %select{2|1}0">; def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 47379e00a7445e3..dca7b66da3796d9 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11051,12 +11051,13 @@ class Sema final { /// Create an CUDALaunchBoundsAttr attribute. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks); + Expr *MinBlocks, + Expr *MaxBlocks); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, - Expr *MaxThreads, Expr *MinBlocks); + Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); /// AddModeAttr - Adds a mode attribute to a particular declaration. void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name, diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 0d4bbd795648008..64d019a10514d60 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr( NVPTXTargetCodeGenInfo::addNVVMMetadata
[clang] 7258317 - [NVPTX] Expose LDU builtins
Author: Jakub Chlanda Date: 2023-03-15T08:41:45Z New Revision: 7258317bade0fd82e257e47b31eee3ad0c6c5305 URL: https://github.com/llvm/llvm-project/commit/7258317bade0fd82e257e47b31eee3ad0c6c5305 DIFF: https://github.com/llvm/llvm-project/commit/7258317bade0fd82e257e47b31eee3ad0c6c5305.diff LOG: [NVPTX] Expose LDU builtins Also check if native half types are supported to give more descriptive error message, without it clang only reports incorrect intrinsic return type. Differential Revision: https://reviews.llvm.org/D145238 Added: clang/test/CodeGen/builtins-nvptx-native-half-type-err.c Modified: clang/include/clang/Basic/BuiltinsNVPTX.def clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/builtins-nvptx-native-half-type.c clang/test/CodeGen/builtins-nvptx.c llvm/test/CodeGen/NVPTX/ldu-ldg.ll Removed: diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 7fcd906c599b8..96531def77a78 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -782,7 +782,43 @@ TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_ll, "LLiLLiD*LLiLLi", "n", SM_60) BUILTIN(__nvvm_compiler_error, "vcC*4", "n") BUILTIN(__nvvm_compiler_warn, "vcC*4", "n") -// __ldg. This is not implemented as a builtin by nvcc. 
+BUILTIN(__nvvm_ldu_c, "ccC*", "") +BUILTIN(__nvvm_ldu_s, "ssC*", "") +BUILTIN(__nvvm_ldu_i, "iiC*", "") +BUILTIN(__nvvm_ldu_l, "LiLiC*", "") +BUILTIN(__nvvm_ldu_ll, "LLiLLiC*", "") + +BUILTIN(__nvvm_ldu_uc, "UcUcC*", "") +BUILTIN(__nvvm_ldu_us, "UsUsC*", "") +BUILTIN(__nvvm_ldu_ui, "UiUiC*", "") +BUILTIN(__nvvm_ldu_ul, "ULiULiC*", "") +BUILTIN(__nvvm_ldu_ull, "ULLiULLiC*", "") + +BUILTIN(__nvvm_ldu_h, "hhC*", "") +BUILTIN(__nvvm_ldu_f, "ffC*", "") +BUILTIN(__nvvm_ldu_d, "ddC*", "") + +BUILTIN(__nvvm_ldu_c2, "E2cE2cC*", "") +BUILTIN(__nvvm_ldu_c4, "E4cE4cC*", "") +BUILTIN(__nvvm_ldu_s2, "E2sE2sC*", "") +BUILTIN(__nvvm_ldu_s4, "E4sE4sC*", "") +BUILTIN(__nvvm_ldu_i2, "E2iE2iC*", "") +BUILTIN(__nvvm_ldu_i4, "E4iE4iC*", "") +BUILTIN(__nvvm_ldu_ll2, "E2LLiE2LLiC*", "") + +BUILTIN(__nvvm_ldu_uc2, "E2UcE2UcC*", "") +BUILTIN(__nvvm_ldu_uc4, "E4UcE4UcC*", "") +BUILTIN(__nvvm_ldu_us2, "E2UsE2UsC*", "") +BUILTIN(__nvvm_ldu_us4, "E4UsE4UsC*", "") +BUILTIN(__nvvm_ldu_ui2, "E2UiE2UiC*", "") +BUILTIN(__nvvm_ldu_ui4, "E4UiE4UiC*", "") +BUILTIN(__nvvm_ldu_ull2, "E2ULLiE2ULLiC*", "") + +BUILTIN(__nvvm_ldu_h2, "E2hE2hC*", "") +BUILTIN(__nvvm_ldu_f2, "E2fE2fC*", "") +BUILTIN(__nvvm_ldu_f4, "E4fE4fC*", "") +BUILTIN(__nvvm_ldu_d2, "E2dE2dC*", "") + BUILTIN(__nvvm_ldg_c, "ccC*", "") BUILTIN(__nvvm_ldg_s, "ssC*", "") BUILTIN(__nvvm_ldg_i, "iiC*", "") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 9424f0f95f7f4..fa8703b1e5202 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18130,7 +18130,12 @@ static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) { Value * CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { - auto MakeLdg = [&](unsigned IntrinsicID) { + auto HasHalfSupport = [&](unsigned BuiltinID) { +auto &Context = getContext(); +return Context.getLangOpts().NativeHalfType || + !Context.getTargetInfo().useFP16ConversionIntrinsics(); + }; + auto MakeLdgLdu = [&](unsigned IntrinsicID) 
{ Value *Ptr = EmitScalarExpr(E->getArg(0)); QualType ArgType = E->getArg(0)->getType(); clang::CharUnits Align = CGM.getNaturalPointeeTypeAlignment(ArgType); @@ -18256,15 +18261,63 @@ CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { // PTX Interoperability section 2.2: "For a vector with an even number of // elements, its alignment is set to number of elements times the alignment // of its member: n*alignof(t)." -return MakeLdg(Intrinsic::nvvm_ldg_global_i); +return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i); case NVPTX::BI__nvvm_ldg_h: - case NVPTX::BI__nvvm_ldg_f: case NVPTX::BI__nvvm_ldg_h2: +if (!HasHalfSupport(BuiltinID)) { + CGM.Error(E->getExprLoc(), +getContext().BuiltinInfo.getName(BuiltinID).str() + +" requires native half type support."); + return nullptr; +} +[[fallthrough]]; + case NVPTX::BI__nvvm_ldg_f: case NVPTX::BI__nvvm_ldg_f2: case NVPTX::BI__nvvm_ldg_f4: case NVPTX::BI__nvvm_ldg_d: case NVPTX::BI__nvvm_ldg_d2: -return MakeLdg(Intrinsic::nvvm_ldg_global_f); +return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f); + + case NVPTX::BI__nvvm_ldu_c: + case NVPTX::BI__nvvm_ldu_c2: + case NVPTX::BI__nvvm_ldu_c4: + case NVPTX::BI__nvvm_ldu_s: + case NVPTX::BI__nvvm_ldu_s2: + case NVPTX::BI__nvvm_ldu_s4: + case NVPTX::BI__nvvm_ldu_i: + case NVPTX::BI__nvvm_ldu_i2: + case NVPTX::BI__nvvm_ldu_i4: + case NVPTX::BI__nvvm_ldu_l: + case NVPTX::BI__nvvm_ld
[clang] ae3c981 - [NVPTX] Enforce half type support is present for builtins
Author: Jakub Chlanda Date: 2023-03-28T08:48:10+02:00 New Revision: ae3c981aa4b85cfae6531ba50df7ad84feebe43c URL: https://github.com/llvm/llvm-project/commit/ae3c981aa4b85cfae6531ba50df7ad84feebe43c DIFF: https://github.com/llvm/llvm-project/commit/ae3c981aa4b85cfae6531ba50df7ad84feebe43c.diff LOG: [NVPTX] Enforce half type support is present for builtins Differential Revision: https://reviews.llvm.org/D146715 Added: Modified: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/builtins-nvptx-native-half-type-err.c llvm/include/llvm/IR/IntrinsicsNVVM.td Removed: diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index c8112b0ea0ec0..f399b0770143a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18162,32 +18162,63 @@ static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) { #undef MMA_VARIANTS_B1_XOR } +static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF, + const CallExpr *E) { + Value *Ptr = CGF.EmitScalarExpr(E->getArg(0)); + QualType ArgType = E->getArg(0)->getType(); + clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType); + llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType()); + return CGF.Builder.CreateCall( + CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}), + {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())}); +} + +static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF, + const CallExpr *E) { + Value *Ptr = CGF.EmitScalarExpr(E->getArg(0)); + llvm::Type *ElemTy = + CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType()); + return CGF.Builder.CreateCall( + CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}), + {Ptr, CGF.EmitScalarExpr(E->getArg(1))}); +} + +static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, + const CallExpr *E, CodeGenFunction &CGF) { + auto &C = CGF.CGM.getContext(); + if (!(C.getLangOpts().NativeHalfType || 
+!C.getTargetInfo().useFP16ConversionIntrinsics())) { +CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getName(BuiltinID).str() + + " requires native half type support."); +return nullptr; + } + + if (IntrinsicID == Intrinsic::nvvm_ldg_global_f || + IntrinsicID == Intrinsic::nvvm_ldu_global_f) +return MakeLdgLdu(IntrinsicID, CGF, E); + + SmallVector Args; + auto *F = CGF.CGM.getIntrinsic(IntrinsicID); + auto *FTy = F->getFunctionType(); + unsigned ICEArguments = 0; + ASTContext::GetBuiltinTypeError Error; + C.GetBuiltinType(BuiltinID, Error, &ICEArguments); + assert(Error == ASTContext::GE_None && "Should not codegen an error"); + for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) { +assert((ICEArguments & (1 << i)) == 0); +auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i)); +auto *PTy = FTy->getParamType(i); +if (PTy != ArgValue->getType()) + ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy); +Args.push_back(ArgValue); + } + + return CGF.Builder.CreateCall(F, Args); +} } // namespace -Value * -CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { - auto HasHalfSupport = [&](unsigned BuiltinID) { -auto &Context = getContext(); -return Context.getLangOpts().NativeHalfType || - !Context.getTargetInfo().useFP16ConversionIntrinsics(); - }; - auto MakeLdgLdu = [&](unsigned IntrinsicID) { -Value *Ptr = EmitScalarExpr(E->getArg(0)); -QualType ArgType = E->getArg(0)->getType(); -clang::CharUnits Align = CGM.getNaturalPointeeTypeAlignment(ArgType); -llvm::Type *ElemTy = ConvertTypeForMem(ArgType->getPointeeType()); -return Builder.CreateCall( -CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}), -{Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())}); - }; - auto MakeScopedAtomic = [&](unsigned IntrinsicID) { -Value *Ptr = EmitScalarExpr(E->getArg(0)); -llvm::Type *ElemTy = -ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType()); -return Builder.CreateCall( -CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}), -{Ptr, 
EmitScalarExpr(E->getArg(1))}); - }; +Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { switch (BuiltinID) { case NVPTX::BI__nvvm_atom_add_gen_i: case NVPTX::BI__nvvm_atom_add_gen_l: @@ -18297,22 +18328,13 @@ CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { // PTX Interoperability section 2.2: "For a vector with an even number of // elements, its alignment is set to number of elements times the alignment // of its member: n*alignof(t)." -return MakeLdgLdu
[clang] 71b0658 - [NVPTX] Add f16 and v2f16 ldg builtins
Author: Jakub Chlanda Date: 2023-03-03T12:49:18+01:00 New Revision: 71b06585857a77691761a7bfd16b5b91454a6894 URL: https://github.com/llvm/llvm-project/commit/71b06585857a77691761a7bfd16b5b91454a6894 DIFF: https://github.com/llvm/llvm-project/commit/71b06585857a77691761a7bfd16b5b91454a6894.diff LOG: [NVPTX] Add f16 and v2f16 ldg builtins Adds f16 and v2f16 ldg builtins and relevant tests. Differential Revision: https://reviews.llvm.org/D144961 Added: Modified: clang/include/clang/Basic/BuiltinsNVPTX.def clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/builtins-nvptx-native-half-type.c llvm/test/CodeGen/NVPTX/ldu-ldg.ll Removed: diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index ea0cd8c3e8431..7fcd906c599b8 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -795,6 +795,7 @@ BUILTIN(__nvvm_ldg_ui, "UiUiC*", "") BUILTIN(__nvvm_ldg_ul, "ULiULiC*", "") BUILTIN(__nvvm_ldg_ull, "ULLiULLiC*", "") +BUILTIN(__nvvm_ldg_h, "hhC*", "") BUILTIN(__nvvm_ldg_f, "ffC*", "") BUILTIN(__nvvm_ldg_d, "ddC*", "") @@ -814,6 +815,7 @@ BUILTIN(__nvvm_ldg_ui2, "E2UiE2UiC*", "") BUILTIN(__nvvm_ldg_ui4, "E4UiE4UiC*", "") BUILTIN(__nvvm_ldg_ull2, "E2ULLiE2ULLiC*", "") +BUILTIN(__nvvm_ldg_h2, "E2hE2hC*", "") BUILTIN(__nvvm_ldg_f2, "E2fE2fC*", "") BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "") BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 1535b14c7fb40..07a39bca9d7a2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18228,7 +18228,9 @@ CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { // elements, its alignment is set to number of elements times the alignment // of its member: n*alignof(t)." 
return MakeLdg(Intrinsic::nvvm_ldg_global_i); + case NVPTX::BI__nvvm_ldg_h: case NVPTX::BI__nvvm_ldg_f: + case NVPTX::BI__nvvm_ldg_h2: case NVPTX::BI__nvvm_ldg_f2: case NVPTX::BI__nvvm_ldg_f4: case NVPTX::BI__nvvm_ldg_d: diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c index 95021f274cd0f..9dc61d6014210 100644 --- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c @@ -172,3 +172,12 @@ __device__ void nvvm_min_max_sm86() { #endif // CHECK: ret void } + +// CHECK-LABEL: nvvm_ldg_native_half_types +__device__ void nvvm_ldg_native_half_types(const void *p) { + // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0 + __nvvm_ldg_h((const __fp16 *)p); + typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2))); + // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0 + __nvvm_ldg_h2((const __fp16v2 *)p); +} diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll index 6d5fcb4cd317e..d40eb7a32027d 100644 --- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll @@ -4,34 +4,82 @@ declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align) declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align) + declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align) +declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align) declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align) +declare i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align) +declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align) +declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align) +declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align) +declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align) - -; CHECK: func0 
-define i8 @func0(ptr addrspace(1) %ptr) { +; CHECK: test_ldu_i8 +define i8 @test_ldu_i8(ptr addrspace(1) %ptr) { ; ldu.global.u8 %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4) ret i8 %val } -; CHECK: func1 -define i32 @func1(ptr addrspace(1) %ptr) { +; CHECK: test_ldu_i32 +define i32 @test_ldu_i32(ptr addrspace(1) %ptr) { ; ldu.global.u32 %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4) ret i32 %val } -; CHECK: func2 -define i8 @func2(ptr addrspace(1) %ptr) { +; CHECK: test_ldg_i8 +define i8 @test_ldg_i8(ptr addrspace(1) %ptr) { ; ld.global.nc.u8 %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4) ret i8 %val } -; CHECK: func3 -define i32 @func3(ptr addrspace(1) %ptr) { +; CHECK: test_ldg_i16 +define i16 @test_ldg_i16(ptr addrspace(1) %ptr) { +; ld.global.nc.u16 + %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace
[clang] 3e37c98 - [cuda, NVPTX] Signed char and (unsigned)long builtins of ldg and ldu
Author: Jakub Chlanda Date: 2023-06-02T09:10:19+02:00 New Revision: 3e37c98bdb512425cab91f6cf156cc66d6103b2f URL: https://github.com/llvm/llvm-project/commit/3e37c98bdb512425cab91f6cf156cc66d6103b2f DIFF: https://github.com/llvm/llvm-project/commit/3e37c98bdb512425cab91f6cf156cc66d6103b2f.diff LOG: [cuda, NVPTX] Signed char and (unsigned)long builtins of ldg and ldu Differential Revision: https://reviews.llvm.org/D151876 Added: Modified: clang/include/clang/Basic/BuiltinsNVPTX.def clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/builtins-nvptx.c Removed: diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 7ffb38d50a6cf..3275d50a85a4b 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -817,6 +817,7 @@ BUILTIN(__nvvm_compiler_error, "vcC*4", "n") BUILTIN(__nvvm_compiler_warn, "vcC*4", "n") BUILTIN(__nvvm_ldu_c, "ccC*", "") +BUILTIN(__nvvm_ldu_sc, "ScScC*", "") BUILTIN(__nvvm_ldu_s, "ssC*", "") BUILTIN(__nvvm_ldu_i, "iiC*", "") BUILTIN(__nvvm_ldu_l, "LiLiC*", "") @@ -833,11 +834,14 @@ BUILTIN(__nvvm_ldu_f, "ffC*", "") BUILTIN(__nvvm_ldu_d, "ddC*", "") BUILTIN(__nvvm_ldu_c2, "E2cE2cC*", "") +BUILTIN(__nvvm_ldu_sc2, "E2ScE2ScC*", "") BUILTIN(__nvvm_ldu_c4, "E4cE4cC*", "") +BUILTIN(__nvvm_ldu_sc4, "E4ScE4ScC*", "") BUILTIN(__nvvm_ldu_s2, "E2sE2sC*", "") BUILTIN(__nvvm_ldu_s4, "E4sE4sC*", "") BUILTIN(__nvvm_ldu_i2, "E2iE2iC*", "") BUILTIN(__nvvm_ldu_i4, "E4iE4iC*", "") +BUILTIN(__nvvm_ldu_l2, "E2LiE2LiC*", "") BUILTIN(__nvvm_ldu_ll2, "E2LLiE2LLiC*", "") BUILTIN(__nvvm_ldu_uc2, "E2UcE2UcC*", "") @@ -846,6 +850,7 @@ BUILTIN(__nvvm_ldu_us2, "E2UsE2UsC*", "") BUILTIN(__nvvm_ldu_us4, "E4UsE4UsC*", "") BUILTIN(__nvvm_ldu_ui2, "E2UiE2UiC*", "") BUILTIN(__nvvm_ldu_ui4, "E4UiE4UiC*", "") +BUILTIN(__nvvm_ldu_ul2, "E2ULiE2ULiC*", "") BUILTIN(__nvvm_ldu_ull2, "E2ULLiE2ULLiC*", "") BUILTIN(__nvvm_ldu_h2, "E2hE2hC*", "") @@ -854,6 +859,7 @@ BUILTIN(__nvvm_ldu_f4, 
"E4fE4fC*", "") BUILTIN(__nvvm_ldu_d2, "E2dE2dC*", "") BUILTIN(__nvvm_ldg_c, "ccC*", "") +BUILTIN(__nvvm_ldg_sc, "ScScC*", "") BUILTIN(__nvvm_ldg_s, "ssC*", "") BUILTIN(__nvvm_ldg_i, "iiC*", "") BUILTIN(__nvvm_ldg_l, "LiLiC*", "") @@ -870,11 +876,14 @@ BUILTIN(__nvvm_ldg_f, "ffC*", "") BUILTIN(__nvvm_ldg_d, "ddC*", "") BUILTIN(__nvvm_ldg_c2, "E2cE2cC*", "") +BUILTIN(__nvvm_ldg_sc2, "E2ScE2ScC*", "") BUILTIN(__nvvm_ldg_c4, "E4cE4cC*", "") +BUILTIN(__nvvm_ldg_sc4, "E4ScE4ScC*", "") BUILTIN(__nvvm_ldg_s2, "E2sE2sC*", "") BUILTIN(__nvvm_ldg_s4, "E4sE4sC*", "") BUILTIN(__nvvm_ldg_i2, "E2iE2iC*", "") BUILTIN(__nvvm_ldg_i4, "E4iE4iC*", "") +BUILTIN(__nvvm_ldg_l2, "E2LiE2LiC*", "") BUILTIN(__nvvm_ldg_ll2, "E2LLiE2LLiC*", "") BUILTIN(__nvvm_ldg_uc2, "E2UcE2UcC*", "") @@ -883,6 +892,7 @@ BUILTIN(__nvvm_ldg_us2, "E2UsE2UsC*", "") BUILTIN(__nvvm_ldg_us4, "E4UsE4UsC*", "") BUILTIN(__nvvm_ldg_ui2, "E2UiE2UiC*", "") BUILTIN(__nvvm_ldg_ui4, "E4UiE4UiC*", "") +BUILTIN(__nvvm_ldg_ul2, "E2ULiE2ULiC*", "") BUILTIN(__nvvm_ldg_ull2, "E2ULLiE2ULLiC*", "") BUILTIN(__nvvm_ldg_h2, "E2hE2hC*", "") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3a61fdd65592a..bfa6fd716c5ec 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18422,8 +18422,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, } case NVPTX::BI__nvvm_ldg_c: + case NVPTX::BI__nvvm_ldg_sc: case NVPTX::BI__nvvm_ldg_c2: + case NVPTX::BI__nvvm_ldg_sc2: case NVPTX::BI__nvvm_ldg_c4: + case NVPTX::BI__nvvm_ldg_sc4: case NVPTX::BI__nvvm_ldg_s: case NVPTX::BI__nvvm_ldg_s2: case NVPTX::BI__nvvm_ldg_s4: @@ -18431,6 +18434,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_ldg_i2: case NVPTX::BI__nvvm_ldg_i4: case NVPTX::BI__nvvm_ldg_l: + case NVPTX::BI__nvvm_ldg_l2: case NVPTX::BI__nvvm_ldg_ll: case NVPTX::BI__nvvm_ldg_ll2: case NVPTX::BI__nvvm_ldg_uc: @@ -18443,6 +18447,7 @@ Value 
*CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_ldg_ui2: case NVPTX::BI__nvvm_ldg_ui4: case NVPTX::BI__nvvm_ldg_ul: + case NVPTX::BI__nvvm_ldg_ul2: case NVPTX::BI__nvvm_ldg_ull: case NVPTX::BI__nvvm_ldg_ull2: // PTX Interoperability section 2.2: "For a vector with an even number of @@ -18457,8 +18462,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E); case NVPTX::BI__nvvm_ldu_c: + case NVPTX::BI__nvvm_ldu_sc: case NVPTX::BI__nvvm_ldu_c2: + case NVPTX::BI__nvvm_ldu_sc2: case NVPTX::BI__nvvm_ldu_c4: + case NVPTX::BI__nvvm_ldu_sc4: case NVPTX::BI__nvvm_ldu_s: case NVPTX::BI__nvvm_ldu_s2: case NVPTX::BI__nvvm_ldu_s4: @@ -18466,6 +18474,7 @@ Value *Cod
[clang] [CUDA][NFC] CudaArch to OffloadArch rename (PR #97028)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/97028 >From 74eb15f035e4aed1dd19b735ac0b2fc5ad172213 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 28 Jun 2024 09:25:56 + Subject: [PATCH] [CUDA][NFC] CudaArch to OffloadArch rename Rename CudaArch to OffloadArch to better reflect its content and the use. Apply a similar rename to helpers handling the enum. --- clang/include/clang/Basic/Cuda.h | 28 +-- clang/lib/Basic/Cuda.cpp | 110 ++-- clang/lib/Basic/Targets/NVPTX.cpp| 160 - clang/lib/Basic/Targets/NVPTX.h | 20 +-- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 173 +-- clang/lib/Driver/Driver.cpp | 43 ++--- clang/lib/Driver/OffloadBundler.cpp | 3 +- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 +- clang/lib/Driver/ToolChains/Cuda.cpp | 38 ++-- clang/lib/Driver/ToolChains/Cuda.h | 4 +- clang/lib/Sema/SemaDeclAttr.cpp | 14 +- 11 files changed, 297 insertions(+), 298 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 01cfe286c491b..83699f8897f66 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -52,7 +52,7 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -enum class CudaArch { +enum class OffloadArch { UNUSED, UNKNOWN, // TODO: Deprecate and remove GPU architectures older than sm_52. @@ -133,8 +133,8 @@ enum class CudaArch { // public one. 
LAST, - CudaDefault = CudaArch::SM_52, - HIPDefault = CudaArch::GFX906, + CudaDefault = OffloadArch::SM_52, + HIPDefault = OffloadArch::GFX906, }; enum class CUDAFunctionTarget { @@ -145,26 +145,26 @@ enum class CUDAFunctionTarget { InvalidTarget }; -static inline bool IsNVIDIAGpuArch(CudaArch A) { - return A >= CudaArch::SM_20 && A < CudaArch::GFX600; +static inline bool IsNVIDIAOffloadArch(OffloadArch A) { + return A >= OffloadArch::SM_20 && A < OffloadArch::GFX600; } -static inline bool IsAMDGpuArch(CudaArch A) { +static inline bool IsAMDOffloadArch(OffloadArch A) { // Generic processor model is for testing only. - return A >= CudaArch::GFX600 && A < CudaArch::Generic; + return A >= OffloadArch::GFX600 && A < OffloadArch::Generic; } -const char *CudaArchToString(CudaArch A); -const char *CudaArchToVirtualArchString(CudaArch A); +const char *OffloadArchToString(OffloadArch A); +const char *OffloadArchToVirtualArchString(OffloadArch A); // The input should have the form "sm_20". -CudaArch StringToCudaArch(llvm::StringRef S); +OffloadArch StringToOffloadArch(llvm::StringRef S); -/// Get the earliest CudaVersion that supports the given CudaArch. -CudaVersion MinVersionForCudaArch(CudaArch A); +/// Get the earliest CudaVersion that supports the given OffloadArch. +CudaVersion MinVersionForOffloadArch(OffloadArch A); -/// Get the latest CudaVersion that supports the given CudaArch. -CudaVersion MaxVersionForCudaArch(CudaArch A); +/// Get the latest CudaVersion that supports the given OffloadArch. 
+CudaVersion MaxVersionForOffloadArch(OffloadArch A); // Various SDK-dependent features that affect CUDA compilation enum class CudaFeature { diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index af99c4d61021e..faf3878f064d2 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -72,23 +72,21 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) { } namespace { -struct CudaArchToStringMap { - CudaArch arch; +struct OffloadArchToStringMap { + OffloadArch arch; const char *arch_name; const char *virtual_arch_name; }; } // namespace -#define SM2(sm, ca) \ - { CudaArch::SM_##sm, "sm_" #sm, ca } +#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca} #define SM(sm) SM2(sm, "compute_" #sm) -#define GFX(gpu) \ - { CudaArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" } -static const CudaArchToStringMap arch_names[] = { +#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"} +static const OffloadArchToStringMap arch_names[] = { // clang-format off -{CudaArch::UNUSED, "", ""}, +{OffloadArch::UNUSED, "", ""}, SM2(20, "compute_20"), SM2(21, "compute_20"), // Fermi -SM(30), {CudaArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler +SM(30), {OffloadArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler SM(50), SM(52), SM(53), // Maxwell SM(60), SM(61), SM(62), // Pascal SM(70), SM(72), // Volta @@ -112,7 +110,7 @@ static const CudaArchToStringMap arch_names[] = { GFX(803), // gfx803 GFX(805), // gfx805 GFX(810), // gfx810 -{Cuda
[clang] [CUDA][NFC] CudaArch to OffloadArch rename (PR #97028)
https://github.com/jchlanda closed https://github.com/llvm/llvm-project/pull/97028 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][NFC] CudaArch to GpuArch rename (PR #97028)
https://github.com/jchlanda created https://github.com/llvm/llvm-project/pull/97028 Rename `CudaArch` to `GpuArch` to better reflect its content and the use. Apply a similar rename to helpers handling the enum. >From 24603da293e82cdf2531283fb5f354c805ead1d6 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 28 Jun 2024 09:25:56 + Subject: [PATCH] [CUDA][NFC] CudaArch to GpuArch rename Rename CudaArch to GpuArch to better reflect its content and the use. Apply a similar rename to helpers handling the enum. --- clang/include/clang/Basic/Cuda.h | 28 +-- clang/lib/Basic/Cuda.cpp | 106 ++-- clang/lib/Basic/Targets/NVPTX.cpp| 160 - clang/lib/Basic/Targets/NVPTX.h | 20 +-- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 173 +-- clang/lib/Driver/Driver.cpp | 38 ++-- clang/lib/Driver/OffloadBundler.cpp | 2 +- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 +- clang/lib/Driver/ToolChains/Cuda.cpp | 32 ++-- clang/lib/Driver/ToolChains/Cuda.h | 4 +- clang/lib/Sema/SemaDeclAttr.cpp | 14 +- 11 files changed, 288 insertions(+), 291 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 01cfe286c491b..b0999d2c5d1ac 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -52,7 +52,7 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -enum class CudaArch { +enum class GpuArch { UNUSED, UNKNOWN, // TODO: Deprecate and remove GPU architectures older than sm_52. @@ -133,8 +133,8 @@ enum class CudaArch { // public one. 
LAST, - CudaDefault = CudaArch::SM_52, - HIPDefault = CudaArch::GFX906, + CudaDefault = GpuArch::SM_52, + HIPDefault = GpuArch::GFX906, }; enum class CUDAFunctionTarget { @@ -145,26 +145,26 @@ enum class CUDAFunctionTarget { InvalidTarget }; -static inline bool IsNVIDIAGpuArch(CudaArch A) { - return A >= CudaArch::SM_20 && A < CudaArch::GFX600; +static inline bool IsNVIDIAGpuArch(GpuArch A) { + return A >= GpuArch::SM_20 && A < GpuArch::GFX600; } -static inline bool IsAMDGpuArch(CudaArch A) { +static inline bool IsAMDGpuArch(GpuArch A) { // Generic processor model is for testing only. - return A >= CudaArch::GFX600 && A < CudaArch::Generic; + return A >= GpuArch::GFX600 && A < GpuArch::Generic; } -const char *CudaArchToString(CudaArch A); -const char *CudaArchToVirtualArchString(CudaArch A); +const char *GpuArchToString(GpuArch A); +const char *GpuArchToVirtualArchString(GpuArch A); // The input should have the form "sm_20". -CudaArch StringToCudaArch(llvm::StringRef S); +GpuArch StringToGpuArch(llvm::StringRef S); -/// Get the earliest CudaVersion that supports the given CudaArch. -CudaVersion MinVersionForCudaArch(CudaArch A); +/// Get the earliest CudaVersion that supports the given GpuArch. +CudaVersion MinVersionForGpuArch(GpuArch A); -/// Get the latest CudaVersion that supports the given CudaArch. -CudaVersion MaxVersionForCudaArch(CudaArch A); +/// Get the latest CudaVersion that supports the given GpuArch. 
+CudaVersion MaxVersionForGpuArch(GpuArch A); // Various SDK-dependent features that affect CUDA compilation enum class CudaFeature { diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index af99c4d61021e..e31f09dce0f3a 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -72,23 +72,21 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) { } namespace { -struct CudaArchToStringMap { - CudaArch arch; +struct GpuArchToStringMap { + GpuArch arch; const char *arch_name; const char *virtual_arch_name; }; } // namespace -#define SM2(sm, ca) \ - { CudaArch::SM_##sm, "sm_" #sm, ca } +#define SM2(sm, ca) {GpuArch::SM_##sm, "sm_" #sm, ca} #define SM(sm) SM2(sm, "compute_" #sm) -#define GFX(gpu) \ - { CudaArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" } -static const CudaArchToStringMap arch_names[] = { +#define GFX(gpu) {GpuArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"} +static const GpuArchToStringMap arch_names[] = { // clang-format off -{CudaArch::UNUSED, "", ""}, +{GpuArch::UNUSED, "", ""}, SM2(20, "compute_20"), SM2(21, "compute_20"), // Fermi -SM(30), {CudaArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler +SM(30), {GpuArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler SM(50), SM(52), SM(53), // Maxwell SM(60), SM(61), SM(62), // Pascal SM(70), SM(72), // Volta @@ -112,7 +110,7 @@ static const CudaArchToStringMap arch_names[] = { GFX(803), // gfx803 GFX(805), // gfx805 GFX(810), // gfx810 -{CudaA
[clang] [CUDA][NFC] CudaArch to GpuArch rename (PR #97028)
jchlanda wrote: This was originally submitted as a ticket against a fork of LLVM, here: https://github.com/intel/llvm/issues/4279 https://github.com/llvm/llvm-project/pull/97028 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][NFC] CudaArch to GpuArch rename (PR #97028)
@@ -52,7 +52,7 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -enum class CudaArch { +enum class GpuArch { jchlanda wrote: Good idea, let me rename it real quick. https://github.com/llvm/llvm-project/pull/97028 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][NFC] CudaArch to GpuArch rename (PR #97028)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/97028 >From aa37f9f9df35c0464850fe95ca339545c5431de8 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 28 Jun 2024 09:25:56 + Subject: [PATCH] [CUDA][NFC] CudaArch to OffloadArch rename Rename CudaArch to OffloadArch to better reflect its content and the use. Apply a similar rename to helpers handling the enum. --- clang/include/clang/Basic/Cuda.h | 28 +-- clang/lib/Basic/Cuda.cpp | 110 ++-- clang/lib/Basic/Targets/NVPTX.cpp| 160 - clang/lib/Basic/Targets/NVPTX.h | 20 +-- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 173 +-- clang/lib/Driver/Driver.cpp | 43 ++--- clang/lib/Driver/OffloadBundler.cpp | 3 +- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 +- clang/lib/Driver/ToolChains/Cuda.cpp | 38 ++-- clang/lib/Driver/ToolChains/Cuda.h | 4 +- clang/lib/Sema/SemaDeclAttr.cpp | 14 +- 11 files changed, 297 insertions(+), 298 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 01cfe286c491b..83699f8897f66 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -52,7 +52,7 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -enum class CudaArch { +enum class OffloadArch { UNUSED, UNKNOWN, // TODO: Deprecate and remove GPU architectures older than sm_52. @@ -133,8 +133,8 @@ enum class CudaArch { // public one. 
LAST, - CudaDefault = CudaArch::SM_52, - HIPDefault = CudaArch::GFX906, + CudaDefault = OffloadArch::SM_52, + HIPDefault = OffloadArch::GFX906, }; enum class CUDAFunctionTarget { @@ -145,26 +145,26 @@ enum class CUDAFunctionTarget { InvalidTarget }; -static inline bool IsNVIDIAGpuArch(CudaArch A) { - return A >= CudaArch::SM_20 && A < CudaArch::GFX600; +static inline bool IsNVIDIAOffloadArch(OffloadArch A) { + return A >= OffloadArch::SM_20 && A < OffloadArch::GFX600; } -static inline bool IsAMDGpuArch(CudaArch A) { +static inline bool IsAMDOffloadArch(OffloadArch A) { // Generic processor model is for testing only. - return A >= CudaArch::GFX600 && A < CudaArch::Generic; + return A >= OffloadArch::GFX600 && A < OffloadArch::Generic; } -const char *CudaArchToString(CudaArch A); -const char *CudaArchToVirtualArchString(CudaArch A); +const char *OffloadArchToString(OffloadArch A); +const char *OffloadArchToVirtualArchString(OffloadArch A); // The input should have the form "sm_20". -CudaArch StringToCudaArch(llvm::StringRef S); +OffloadArch StringToOffloadArch(llvm::StringRef S); -/// Get the earliest CudaVersion that supports the given CudaArch. -CudaVersion MinVersionForCudaArch(CudaArch A); +/// Get the earliest CudaVersion that supports the given OffloadArch. +CudaVersion MinVersionForOffloadArch(OffloadArch A); -/// Get the latest CudaVersion that supports the given CudaArch. -CudaVersion MaxVersionForCudaArch(CudaArch A); +/// Get the latest CudaVersion that supports the given OffloadArch. 
+CudaVersion MaxVersionForOffloadArch(OffloadArch A); // Various SDK-dependent features that affect CUDA compilation enum class CudaFeature { diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index af99c4d61021e..faf3878f064d2 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -72,23 +72,21 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) { } namespace { -struct CudaArchToStringMap { - CudaArch arch; +struct OffloadArchToStringMap { + OffloadArch arch; const char *arch_name; const char *virtual_arch_name; }; } // namespace -#define SM2(sm, ca) \ - { CudaArch::SM_##sm, "sm_" #sm, ca } +#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca} #define SM(sm) SM2(sm, "compute_" #sm) -#define GFX(gpu) \ - { CudaArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" } -static const CudaArchToStringMap arch_names[] = { +#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"} +static const OffloadArchToStringMap arch_names[] = { // clang-format off -{CudaArch::UNUSED, "", ""}, +{OffloadArch::UNUSED, "", ""}, SM2(20, "compute_20"), SM2(21, "compute_20"), // Fermi -SM(30), {CudaArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler +SM(30), {OffloadArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler SM(50), SM(52), SM(53), // Maxwell SM(60), SM(61), SM(62), // Pascal SM(70), SM(72), // Volta @@ -112,7 +110,7 @@ static const CudaArchToStringMap arch_names[] = { GFX(803), // gfx803 GFX(805), // gfx805 GFX(810), // gfx810 -{Cuda
[clang] [CUDA][NFC] CudaArch to GpuArch rename (PR #97028)
@@ -52,7 +52,7 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -enum class CudaArch { +enum class GpuArch { jchlanda wrote: Done. https://github.com/llvm/llvm-project/pull/97028 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][NFC] CudaArch to OffloadArch rename (PR #97028)
https://github.com/jchlanda edited https://github.com/llvm/llvm-project/pull/97028 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][NFC] CudaArch to OffloadArch rename (PR #97028)
https://github.com/jchlanda edited https://github.com/llvm/llvm-project/pull/97028 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][NFC] CudaArch to OffloadArch rename (PR #97028)
@@ -52,7 +52,7 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -enum class CudaArch { +enum class GpuArch { jchlanda wrote: I've folded the commits to keep it clean. https://github.com/llvm/llvm-project/pull/97028 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][NFC] CudaArch to OffloadArch rename (PR #97028)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/97028 >From 2d61a382fa66d9cc23cd6c78657a6161edf5b5c2 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 28 Jun 2024 09:25:56 + Subject: [PATCH] [CUDA][NFC] CudaArch to OffloadArch rename Rename CudaArch to OffloadArch to better reflect its content and the use. Apply a similar rename to helpers handling the enum. --- clang/include/clang/Basic/Cuda.h | 28 +-- clang/lib/Basic/Cuda.cpp | 110 ++-- clang/lib/Basic/Targets/NVPTX.cpp| 160 - clang/lib/Basic/Targets/NVPTX.h | 20 +-- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 173 +-- clang/lib/Driver/Driver.cpp | 43 ++--- clang/lib/Driver/OffloadBundler.cpp | 3 +- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 +- clang/lib/Driver/ToolChains/Cuda.cpp | 38 ++-- clang/lib/Driver/ToolChains/Cuda.h | 4 +- clang/lib/Sema/SemaDeclAttr.cpp | 14 +- 11 files changed, 297 insertions(+), 298 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 01cfe286c491b..83699f8897f66 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -52,7 +52,7 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -enum class CudaArch { +enum class OffloadArch { UNUSED, UNKNOWN, // TODO: Deprecate and remove GPU architectures older than sm_52. @@ -133,8 +133,8 @@ enum class CudaArch { // public one. 
LAST, - CudaDefault = CudaArch::SM_52, - HIPDefault = CudaArch::GFX906, + CudaDefault = OffloadArch::SM_52, + HIPDefault = OffloadArch::GFX906, }; enum class CUDAFunctionTarget { @@ -145,26 +145,26 @@ enum class CUDAFunctionTarget { InvalidTarget }; -static inline bool IsNVIDIAGpuArch(CudaArch A) { - return A >= CudaArch::SM_20 && A < CudaArch::GFX600; +static inline bool IsNVIDIAOffloadArch(OffloadArch A) { + return A >= OffloadArch::SM_20 && A < OffloadArch::GFX600; } -static inline bool IsAMDGpuArch(CudaArch A) { +static inline bool IsAMDOffloadArch(OffloadArch A) { // Generic processor model is for testing only. - return A >= CudaArch::GFX600 && A < CudaArch::Generic; + return A >= OffloadArch::GFX600 && A < OffloadArch::Generic; } -const char *CudaArchToString(CudaArch A); -const char *CudaArchToVirtualArchString(CudaArch A); +const char *OffloadArchToString(OffloadArch A); +const char *OffloadArchToVirtualArchString(OffloadArch A); // The input should have the form "sm_20". -CudaArch StringToCudaArch(llvm::StringRef S); +OffloadArch StringToOffloadArch(llvm::StringRef S); -/// Get the earliest CudaVersion that supports the given CudaArch. -CudaVersion MinVersionForCudaArch(CudaArch A); +/// Get the earliest CudaVersion that supports the given OffloadArch. +CudaVersion MinVersionForOffloadArch(OffloadArch A); -/// Get the latest CudaVersion that supports the given CudaArch. -CudaVersion MaxVersionForCudaArch(CudaArch A); +/// Get the latest CudaVersion that supports the given OffloadArch. 
+CudaVersion MaxVersionForOffloadArch(OffloadArch A); // Various SDK-dependent features that affect CUDA compilation enum class CudaFeature { diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index af99c4d61021e..faf3878f064d2 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -72,23 +72,21 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) { } namespace { -struct CudaArchToStringMap { - CudaArch arch; +struct OffloadArchToStringMap { + OffloadArch arch; const char *arch_name; const char *virtual_arch_name; }; } // namespace -#define SM2(sm, ca) \ - { CudaArch::SM_##sm, "sm_" #sm, ca } +#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca} #define SM(sm) SM2(sm, "compute_" #sm) -#define GFX(gpu) \ - { CudaArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" } -static const CudaArchToStringMap arch_names[] = { +#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"} +static const OffloadArchToStringMap arch_names[] = { // clang-format off -{CudaArch::UNUSED, "", ""}, +{OffloadArch::UNUSED, "", ""}, SM2(20, "compute_20"), SM2(21, "compute_20"), // Fermi -SM(30), {CudaArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler +SM(30), {OffloadArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler SM(50), SM(52), SM(53), // Maxwell SM(60), SM(61), SM(62), // Pascal SM(70), SM(72), // Volta @@ -112,7 +110,7 @@ static const CudaArchToStringMap arch_names[] = { GFX(803), // gfx803 GFX(805), // gfx805 GFX(810), // gfx810 -{Cuda
[clang] [CUDA][NFC] CudaArch to OffloadArch rename (PR #97028)
https://github.com/jchlanda updated https://github.com/llvm/llvm-project/pull/97028 >From 7424d33961400546dfb142ec220e59bb78fe1c82 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 28 Jun 2024 09:25:56 + Subject: [PATCH] [CUDA][NFC] CudaArch to OffloadArch rename Rename CudaArch to OffloadArch to better reflect its content and the use. Apply a similar rename to helpers handling the enum. --- clang/include/clang/Basic/Cuda.h | 28 +-- clang/lib/Basic/Cuda.cpp | 110 ++-- clang/lib/Basic/Targets/NVPTX.cpp| 160 - clang/lib/Basic/Targets/NVPTX.h | 20 +-- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 173 +-- clang/lib/Driver/Driver.cpp | 43 ++--- clang/lib/Driver/OffloadBundler.cpp | 3 +- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 +- clang/lib/Driver/ToolChains/Cuda.cpp | 38 ++-- clang/lib/Driver/ToolChains/Cuda.h | 4 +- clang/lib/Sema/SemaDeclAttr.cpp | 14 +- 11 files changed, 297 insertions(+), 298 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 01cfe286c491b..83699f8897f66 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -52,7 +52,7 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -enum class CudaArch { +enum class OffloadArch { UNUSED, UNKNOWN, // TODO: Deprecate and remove GPU architectures older than sm_52. @@ -133,8 +133,8 @@ enum class CudaArch { // public one. 
LAST, - CudaDefault = CudaArch::SM_52, - HIPDefault = CudaArch::GFX906, + CudaDefault = OffloadArch::SM_52, + HIPDefault = OffloadArch::GFX906, }; enum class CUDAFunctionTarget { @@ -145,26 +145,26 @@ enum class CUDAFunctionTarget { InvalidTarget }; -static inline bool IsNVIDIAGpuArch(CudaArch A) { - return A >= CudaArch::SM_20 && A < CudaArch::GFX600; +static inline bool IsNVIDIAOffloadArch(OffloadArch A) { + return A >= OffloadArch::SM_20 && A < OffloadArch::GFX600; } -static inline bool IsAMDGpuArch(CudaArch A) { +static inline bool IsAMDOffloadArch(OffloadArch A) { // Generic processor model is for testing only. - return A >= CudaArch::GFX600 && A < CudaArch::Generic; + return A >= OffloadArch::GFX600 && A < OffloadArch::Generic; } -const char *CudaArchToString(CudaArch A); -const char *CudaArchToVirtualArchString(CudaArch A); +const char *OffloadArchToString(OffloadArch A); +const char *OffloadArchToVirtualArchString(OffloadArch A); // The input should have the form "sm_20". -CudaArch StringToCudaArch(llvm::StringRef S); +OffloadArch StringToOffloadArch(llvm::StringRef S); -/// Get the earliest CudaVersion that supports the given CudaArch. -CudaVersion MinVersionForCudaArch(CudaArch A); +/// Get the earliest CudaVersion that supports the given OffloadArch. +CudaVersion MinVersionForOffloadArch(OffloadArch A); -/// Get the latest CudaVersion that supports the given CudaArch. -CudaVersion MaxVersionForCudaArch(CudaArch A); +/// Get the latest CudaVersion that supports the given OffloadArch. 
+CudaVersion MaxVersionForOffloadArch(OffloadArch A); // Various SDK-dependent features that affect CUDA compilation enum class CudaFeature { diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index af99c4d61021e..faf3878f064d2 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -72,23 +72,21 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) { } namespace { -struct CudaArchToStringMap { - CudaArch arch; +struct OffloadArchToStringMap { + OffloadArch arch; const char *arch_name; const char *virtual_arch_name; }; } // namespace -#define SM2(sm, ca) \ - { CudaArch::SM_##sm, "sm_" #sm, ca } +#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca} #define SM(sm) SM2(sm, "compute_" #sm) -#define GFX(gpu) \ - { CudaArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" } -static const CudaArchToStringMap arch_names[] = { +#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"} +static const OffloadArchToStringMap arch_names[] = { // clang-format off -{CudaArch::UNUSED, "", ""}, +{OffloadArch::UNUSED, "", ""}, SM2(20, "compute_20"), SM2(21, "compute_20"), // Fermi -SM(30), {CudaArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler +SM(30), {OffloadArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler SM(50), SM(52), SM(53), // Maxwell SM(60), SM(61), SM(62), // Pascal SM(70), SM(72), // Volta @@ -112,7 +110,7 @@ static const CudaArchToStringMap arch_names[] = { GFX(803), // gfx803 GFX(805), // gfx805 GFX(810), // gfx810 -{Cuda
[clang] [AMDGPU] Enable overriding of OpenCL's default address space (PR #117588)
https://github.com/jchlanda created https://github.com/llvm/llvm-project/pull/117588 `opencl-def-is-generic-addrspace` sets the default address space from private to generic. This feature allows for building bitcode libraries written in OpenCL that can then be linked against modules compiled from sources written in languages that expect generic as the default address space. >From ba7dcef2bb18d1d1305847e3b647fe63fd344809 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 25 Nov 2024 18:09:36 + Subject: [PATCH] [AMDGPU] Enable overriding of OpenCL's default address space `opencl-def-is-generic-addrspace` sets the default address space from private to generic. This feature allows for building bitcode libraries written in OpenCL that can then be linked against modules compiled from sources written in languages that expect generic as the default address space. --- clang/include/clang/Basic/TargetInfo.h | 3 +++ clang/include/clang/Basic/TargetOptions.h | 4 clang/include/clang/Driver/Options.td | 7 +++ clang/lib/Basic/TargetInfo.cpp | 1 + clang/lib/Basic/Targets/AMDGPU.cpp | 10 -- clang/lib/Driver/ToolChains/Clang.cpp | 2 ++ .../AMDGPU/opencl_def_is_generic_addrspace.cl | 18 ++ 7 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/AMDGPU/opencl_def_is_generic_addrspace.cl diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 9cd23d123f2bac..e2f2c2fafa128d 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -271,6 +271,9 @@ class TargetInfo : public TransferrableTargetInfo, LLVM_PREFERRED_TYPE(bool) unsigned AllowAMDGPUUnsafeFPAtomics : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned OpenCLDefIsGenericAddrSpace : 1; + LLVM_PREFERRED_TYPE(bool) unsigned HasUnalignedAccess : 1; diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index 2049f03b28893f..5e28586ce49c77 100644 --- 
a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -78,6 +78,10 @@ class TargetOptions { /// \brief If enabled, allow AMDGPU unsafe floating point atomics. bool AllowAMDGPUUnsafeFPAtomics = false; + /// \brief If enabled, allow overriding of the default address space (from + /// private to generic). + bool OpenCLDefIsGenericAddrSpace = false; + /// \brief Code object version for AMDGPU. llvm::CodeObjectVersionKind CodeObjectVersion = llvm::CodeObjectVersionKind::COV_None; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 40fd48761928b3..5f46d417879b79 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5164,6 +5164,13 @@ defm unsafe_fp_atomics : BoolMOption<"unsafe-fp-atomics", "for certain memory destinations. (AMDGPU only)">, NegFlag>; +defm opencl_def_is_generic_addrspace: BoolMOption<"opencl-def-is-generic-addrspace", + TargetOpts<"OpenCLDefIsGenericAddrSpace">, DefaultFalse, + PosFlag, + NegFlag>; + def faltivec : Flag<["-"], "faltivec">, Group; def fno_altivec : Flag<["-"], "fno-altivec">, Group; let Flags = [TargetSpecific] in { diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 86befb1cbc74fc..d6f521895d882d 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -157,6 +157,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { HasAArch64SVETypes = false; HasRISCVVTypes = false; AllowAMDGPUUnsafeFPAtomics = false; + OpenCLDefIsGenericAddrSpace = false; HasUnalignedAccess = false; ARMCDECoprocMask = 0; diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 99f8f2944e2796..e31f22011faeb9 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -240,6 +240,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple, HasFloat16 = true; WavefrontSize = (GPUFeatures & 
llvm::AMDGPU::FEATURE_WAVE32) ? 32 : 64; AllowAMDGPUUnsafeFPAtomics = Opts.AllowAMDGPUUnsafeFPAtomics; + OpenCLDefIsGenericAddrSpace = Opts.OpenCLDefIsGenericAddrSpace; // Set pointer width and alignment for the generic address space. PointerWidth = PointerAlign = getPointerWidthV(LangAS::Default); @@ -262,8 +263,13 @@ void AMDGPUTargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) { // ToDo: There are still a few places using default address space as private // address space in OpenCL, which needs to be cleaned up, then the references // to OpenCL can be removed from the following line. - setAddressSpaceMap((Opts.OpenCL && !Opts.OpenCLGenericAddressSpace) || - !isAMDGCN(getTriple())); + bool DefaultIsPrivate = (Opts.OpenCL && !Opts.OpenCLGenericAddressSpace) ||
[clang] [AMDGPU] Enable overriding of OpenCL's default address space (PR #117588)
https://github.com/jchlanda closed https://github.com/llvm/llvm-project/pull/117588 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU] Enable overriding of OpenCL's default address space (PR #117588)
jchlanda wrote: > In principle I am against this, it adds a relatively brittle hook, and > bypasses the pre-existing mechanisms (use CL2 or enable the generic-as > extension) for obtaining this behaviour, in a way that does not ensure that > the pre-existing mechanisms are available (e.g. it appears one could pass the > option, without asking for the generic as extension on CL3.0). That is fair; after having a second look, the additional check for `Opts.OpenCLGenericAddressSpace` that [you added](https://github.com/llvm/llvm-project/pull/112442) recently is all we need to make it work. Closing it now. https://github.com/llvm/llvm-project/pull/117588 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Pass -offload-lto instead of -lto for cuda/hip kernels (PR #125243)
@@ -498,12 +498,17 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { }; // Forward all of the `--offload-opt` and similar options to the device. - CmdArgs.push_back("-flto"); for (auto &Arg : Args.filtered(OPT_offload_opt_eq_minus, OPT_mllvm)) CmdArgs.append( {"-Xlinker", Args.MakeArgString("--plugin-opt=" + StringRef(Arg->getValue()))}); + if (Triple.isNVPTX() || Triple.isAMDGPU()) { jchlanda wrote: No need for braces in single statement conditionals. https://github.com/llvm/llvm-project/pull/125243 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Pass -offload-lto instead of -lto for cuda/hip kernels (PR #125243)
https://github.com/jchlanda approved this pull request. https://github.com/llvm/llvm-project/pull/125243 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits