[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)

via cfe-commits Fri, 15 Sep 2023 04:25:23 -0700

llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang
            
<details>
<summary>Changes</summary>
Since SM_90 CUDA supports specifying additional argument to the launch_bounds 
attribute: maxBlocksPerCluster, to express the maximum number of CTAs that can 
be part of the cluster. See: 
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank
 and
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds 
for details.
--

Patch is 24.44 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/66496.diff

13 Files Affected:

- (modified) clang/include/clang/Basic/Attr.td (+2-1) 
- (modified) clang/include/clang/Basic/DiagnosticSemaKinds.td (+4) 
- (modified) clang/include/clang/Sema/Sema.h (+3-2) 
- (modified) clang/lib/CodeGen/Targets/NVPTX.cpp (+10-2) 
- (modified) clang/lib/Parse/ParseOpenMP.cpp (+2-1) 
- (modified) clang/lib/Sema/SemaDeclAttr.cpp (+39-7) 
- (modified) clang/lib/Sema/SemaTemplateInstantiateDecl.cpp (+9-1) 
- (modified) clang/test/CodeGenCUDA/launch-bounds.cu (+69) 
- (modified) clang/test/SemaCUDA/launch_bounds.cu (+3-1) 
- (added) clang/test/SemaCUDA/launch_bounds_sm_90.cu (+45) 
- (modified) llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp (+36-43) 
- (modified) llvm/lib/Target/NVPTX/NVPTXUtilities.cpp (+4) 
- (modified) llvm/lib/Target/NVPTX/NVPTXUtilities.h (+1) 


<pre>
diff --git a/clang/include/clang/Basic/Attr.td 
b/clang/include/clang/Basic/Attr.td
index c95db7e8049d47a..3c51261bd3eb081 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr {
 
 def CUDALaunchBounds : InheritableAttr {
   let Spellings = [GNU&amp;lt;&amp;quot;launch_bounds&amp;quot;&amp;gt;, 
Declspec&amp;lt;&amp;quot;__launch_bounds__&amp;quot;&amp;gt;];
-  let Args = [ExprArgument&amp;lt;&amp;quot;MaxThreads&amp;quot;&amp;gt;, 
ExprArgument&amp;lt;&amp;quot;MinBlocks&amp;quot;, 1&amp;gt;];
+  let Args = [ExprArgument&amp;lt;&amp;quot;MaxThreads&amp;quot;&amp;gt;, 
ExprArgument&amp;lt;&amp;quot;MinBlocks&amp;quot;, 1&amp;gt;,
+              ExprArgument&amp;lt;&amp;quot;MaxBlocks&amp;quot;, 1&amp;gt;];
   let LangOpts = [CUDA];
   let Subjects = SubjectList&amp;lt;[ObjCMethod, FunctionLike]&amp;gt;;
   // An AST node is created for this attribute, but is not used by other parts
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 0ac4df8edb242f6..088e3a45c7babba 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11836,6 +11836,10 @@ def err_sycl_special_type_num_init_method : 
Error&amp;lt;
   &amp;quot;types with &amp;#x27;sycl_special_class&amp;#x27; attribute must 
have one and only one &amp;#x27;__init&amp;#x27; &amp;quot;
   &amp;quot;method defined&amp;quot;&amp;gt;;
 
+def warn_cuda_maxclusterrank_sm_90 : Warning&amp;lt;
+  &amp;quot;maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, 
ignoring &amp;quot;
+  &amp;quot;%1 attribute&amp;quot;&amp;gt;, 
InGroup&amp;lt;IgnoredAttributes&amp;gt;;
+
 def err_bit_int_bad_size : Error&amp;lt;&amp;quot;%select{signed|unsigned}0 
_BitInt must &amp;quot;
                                  &amp;quot;have a bit size of at least 
%select{2|1}0&amp;quot;&amp;gt;;
 def err_bit_int_max_size : Error&amp;lt;&amp;quot;%select{signed|unsigned}0 
_BitInt of bit &amp;quot;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 47379e00a7445e3..dca7b66da3796d9 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11051,12 +11051,13 @@ class Sema final {
   /// Create an CUDALaunchBoundsAttr attribute.
   CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo 
&amp;amp;CI,
                                                Expr *MaxThreads,
-                                               Expr *MinBlocks);
+                                               Expr *MinBlocks,
+                                               Expr *MaxBlocks);
 
   /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular
   /// declaration.
   void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &amp;amp;CI,
-                           Expr *MaxThreads, Expr *MinBlocks);
+                           Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
 
   /// AddModeAttr - Adds a mode attribute to a particular declaration.
   void AddModeAttr(Decl *D, const AttributeCommonInfo &amp;amp;CI, 
IdentifierInfo *Name,
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp 
b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 0d4bbd795648008..64d019a10514d60 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(
     NVPTXTargetCodeGenInfo::addNVVMMetadata(F, &amp;quot;maxntidx&amp;quot;,
                                             MaxThreads.getExtValue());
 
-  // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was
-  // not specified in __launch_bounds__ or if the user specified a 0 value,
+  // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
+  // was not specified in __launch_bounds__ or if the user specified a 0 value,
   // we don&amp;#x27;t have to add a PTX directive.
   if (Attr-&amp;gt;getMinBlocks()) {
     llvm::APSInt MinBlocks(32);
@@ -307,6 +307,14 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(
       NVPTXTargetCodeGenInfo::addNVVMMetadata(F, &amp;quot;minctasm&amp;quot;,
                                               MinBlocks.getExtValue());
   }
+  if (Attr-&amp;gt;getMaxBlocks()) {
+    llvm::APSInt MaxBlocks(32);
+    MaxBlocks = 
Attr-&amp;gt;getMaxBlocks()-&amp;gt;EvaluateKnownConstInt(getContext());
+    if (MaxBlocks &amp;gt; 0)
+      // Create !{&amp;lt;func-ref&amp;gt;, metadata 
!&amp;quot;maxclusterrank&amp;quot;, i32 &amp;lt;val&amp;gt;} node
+      NVPTXTargetCodeGenInfo::addNVVMMetadata(F, 
&amp;quot;maxclusterrank&amp;quot;,
+                                              MaxBlocks.getExtValue());
+  }
 }
 
 std::unique_ptr&amp;lt;TargetCodeGenInfo&amp;gt;
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 605b97617432ed3..8a8a126bf7244d4 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3739,7 +3739,8 @@ OMPClause *Parser::ParseOpenMPOMPXAttributesClause(bool 
ParseOnly) {
         continue;
       if (auto *A = Actions.CreateLaunchBoundsAttr(
               PA, PA.getArgAsExpr(0),
-              PA.getNumArgs() &amp;gt; 1 ? PA.getArgAsExpr(1) : nullptr))
+              PA.getNumArgs() &amp;gt; 1 ? PA.getArgAsExpr(1) : nullptr,
+              PA.getNumArgs() &amp;gt; 2 ? PA.getArgAsExpr(2) : nullptr))
         Attrs.push_back(A);
       continue;
     default:
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index cc98713241395ec..e62a0d4fc29f9cd 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5607,6 +5607,21 @@ bool Sema::CheckRegparmAttr(const ParsedAttr 
&amp;amp;AL, unsigned &amp;amp;numParams) {
   return false;
 }
 
+// Helper to get CudaArch.
+static CudaArch getCudaArch(const TargetInfo &amp;amp;TI) {
+  if (!TI.hasFeature(&amp;quot;ptx&amp;quot;)) {
+    return CudaArch::UNKNOWN;
+  }
+  for (const auto &amp;amp;Feature : TI.getTargetOpts().FeatureMap) {
+    if (Feature.getValue()) {
+      CudaArch Arch = StringToCudaArch(Feature.getKey());
+      if (Arch != CudaArch::UNKNOWN)
+        return Arch;
+    }
+  }
+  return CudaArch::UNKNOWN;
+}
+
 // Checks whether an argument of launch_bounds attribute is
 // acceptable, performs implicit conversion to Rvalue, and returns
 // non-nullptr Expr result on success. Otherwise, it returns nullptr
@@ -5650,8 +5665,8 @@ static Expr *makeLaunchBoundsArgExpr(Sema &amp;amp;S, 
Expr *E,
 
 CUDALaunchBoundsAttr *
 Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo &amp;amp;CI, Expr 
*MaxThreads,
-                             Expr *MinBlocks) {
-  CUDALaunchBoundsAttr TmpAttr(Context, CI, MaxThreads, MinBlocks);
+                             Expr *MinBlocks, Expr *MaxBlocks) {
+  CUDALaunchBoundsAttr TmpAttr(Context, CI, MaxThreads, MinBlocks, MaxBlocks);
   MaxThreads = makeLaunchBoundsArgExpr(*this, MaxThreads, TmpAttr, 0);
   if (MaxThreads == nullptr)
     return nullptr;
@@ -5662,22 +5677,39 @@ Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo 
&amp;amp;CI, Expr *MaxThreads,
       return nullptr;
   }
 
+  if (MaxBlocks) {
+    // Feature &amp;#x27;.maxclusterrank&amp;#x27; requires .target sm_90 or 
higher.
+    auto SM = getCudaArch(Context.getTargetInfo());
+    if (SM == CudaArch::UNKNOWN || SM &amp;lt; CudaArch::SM_90) {
+      Diag(MaxBlocks-&amp;gt;getBeginLoc(), 
diag::warn_cuda_maxclusterrank_sm_90)
+          &amp;lt;&amp;lt; CudaArchToString(SM) &amp;lt;&amp;lt; CI 
&amp;lt;&amp;lt; MaxBlocks-&amp;gt;getSourceRange();
+      // Ignore it by setting MaxBlocks to null;
+      MaxBlocks = nullptr;
+    } else {
+      MaxBlocks = makeLaunchBoundsArgExpr(*this, MaxBlocks, TmpAttr, 2);
+      if (MaxBlocks == nullptr)
+        return nullptr;
+    }
+  }
+
   return ::new (Context)
-      CUDALaunchBoundsAttr(Context, CI, MaxThreads, MinBlocks);
+      CUDALaunchBoundsAttr(Context, CI, MaxThreads, MinBlocks, MaxBlocks);
 }
 
 void Sema::AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &amp;amp;CI,
-                               Expr *MaxThreads, Expr *MinBlocks) {
-  if (auto *Attr = CreateLaunchBoundsAttr(CI, MaxThreads, MinBlocks))
+                               Expr *MaxThreads, Expr *MinBlocks,
+                               Expr *MaxBlocks) {
+  if (auto *Attr = CreateLaunchBoundsAttr(CI, MaxThreads, MinBlocks, 
MaxBlocks))
     D-&amp;gt;addAttr(Attr);
 }
 
 static void handleLaunchBoundsAttr(Sema &amp;amp;S, Decl *D, const ParsedAttr 
&amp;amp;AL) {
-  if (!AL.checkAtLeastNumArgs(S, 1) || !AL.checkAtMostNumArgs(S, 2))
+  if (!AL.checkAtLeastNumArgs(S, 1) || !AL.checkAtMostNumArgs(S, 3))
     return;
 
   S.AddLaunchBoundsAttr(D, AL, AL.getArgAsExpr(0),
-                        AL.getNumArgs() &amp;gt; 1 ? AL.getArgAsExpr(1) : 
nullptr);
+                        AL.getNumArgs() &amp;gt; 1 ? AL.getArgAsExpr(1) : 
nullptr,
+                        AL.getNumArgs() &amp;gt; 2 ? AL.getArgAsExpr(2) : 
nullptr);
 }
 
 static void handleArgumentWithTypeTagAttr(Sema &amp;amp;S, Decl *D,
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp 
b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 37a7d6204413a38..3f7268f5450a6fa 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -302,7 +302,15 @@ static void instantiateDependentCUDALaunchBoundsAttr(
     MinBlocks = Result.getAs&amp;lt;Expr&amp;gt;();
   }
 
-  S.AddLaunchBoundsAttr(New, Attr, MaxThreads, MinBlocks);
+  Expr *MaxBlocks = nullptr;
+  if (Attr.getMaxBlocks()) {
+    Result = S.SubstExpr(Attr.getMaxBlocks(), TemplateArgs);
+    if (Result.isInvalid())
+      return;
+    MaxBlocks = Result.getAs&amp;lt;Expr&amp;gt;();
+  }
+
+  S.AddLaunchBoundsAttr(New, Attr, MaxThreads, MinBlocks, MaxBlocks);
 }
 
 static void
diff --git a/clang/test/CodeGenCUDA/launch-bounds.cu 
b/clang/test/CodeGenCUDA/launch-bounds.cu
index 58bcc410201f35f..31ca9216b413e92 100644
--- a/clang/test/CodeGenCUDA/launch-bounds.cu
+++ b/clang/test/CodeGenCUDA/launch-bounds.cu
@@ -1,9 +1,13 @@
 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device 
-emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -target-cpu sm_90 
-DUSE_MAX_BLOCKS -fcuda-is-device -emit-llvm -o - | FileCheck 
-check-prefix=CHECK_MAX_BLOCKS %s
 
 #include &amp;quot;Inputs/cuda.h&amp;quot;
 
 #define MAX_THREADS_PER_BLOCK 256
 #define MIN_BLOCKS_PER_MP     2
+#ifdef USE_MAX_BLOCKS
+#define MAX_BLOCKS_PER_MP     4
+#endif
 
 // Test both max threads per block and Min cta per sm.
 extern &amp;quot;C&amp;quot; {
@@ -17,6 +21,21 @@ Kernel1()
 // CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !&amp;quot;maxntidx&amp;quot;, i32 256}
 // CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !&amp;quot;minctasm&amp;quot;, i32 2}
 
+#ifdef USE_MAX_BLOCKS
+// Test max threads per block and min/max cta per sm.
+extern &amp;quot;C&amp;quot; {
+__global__ void
+__launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP 
)
+Kernel1_sm_90()
+{
+}
+}
+
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, 
!&amp;quot;maxntidx&amp;quot;, i32 256}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, 
!&amp;quot;minctasm&amp;quot;, i32 2}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, 
!&amp;quot;maxclusterrank&amp;quot;, i32 4}
+#endif // USE_MAX_BLOCKS
+
 // Test only max threads per block. Min cta per sm defaults to 0, and
 // CodeGen doesn&amp;#x27;t output a zero value for minctasm.
 extern &amp;quot;C&amp;quot; {
@@ -50,6 +69,20 @@ template __global__ void 
Kernel4&amp;lt;MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP&amp;gt;();
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, 
!&amp;quot;maxntidx&amp;quot;, i32 256}
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, 
!&amp;quot;minctasm&amp;quot;, i32 2}
 
+#ifdef USE_MAX_BLOCKS
+template &amp;lt;int max_threads_per_block, int min_blocks_per_mp, int 
max_blocks_per_mp&amp;gt;
+__global__ void
+__launch_bounds__(max_threads_per_block, min_blocks_per_mp, max_blocks_per_mp)
+Kernel4_sm_90()
+{
+}
+template __global__ void Kernel4_sm_90&amp;lt;MAX_THREADS_PER_BLOCK, 
MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP&amp;gt;();
+
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, 
!&amp;quot;maxntidx&amp;quot;, i32 256}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, 
!&amp;quot;minctasm&amp;quot;, i32 2}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, 
!&amp;quot;maxclusterrank&amp;quot;, i32 4}
+#endif //USE_MAX_BLOCKS
+
 const int constint = 100;
 template &amp;lt;int max_threads_per_block, int min_blocks_per_mp&amp;gt;
 __global__ void
@@ -63,6 +96,23 @@ template __global__ void 
Kernel5&amp;lt;MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP&amp;gt;();
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, 
!&amp;quot;maxntidx&amp;quot;, i32 356}
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, 
!&amp;quot;minctasm&amp;quot;, i32 258}
 
+#ifdef USE_MAX_BLOCKS
+
+template &amp;lt;int max_threads_per_block, int min_blocks_per_mp, int 
max_blocks_per_mp&amp;gt;
+__global__ void
+__launch_bounds__(max_threads_per_block + constint,
+                  min_blocks_per_mp + max_threads_per_block,
+                  max_blocks_per_mp + max_threads_per_block)
+Kernel5_sm_90()
+{
+}
+template __global__ void Kernel5_sm_90&amp;lt;MAX_THREADS_PER_BLOCK, 
MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP&amp;gt;();
+
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, 
!&amp;quot;maxntidx&amp;quot;, i32 356}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, 
!&amp;quot;minctasm&amp;quot;, i32 258}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, 
!&amp;quot;maxclusterrank&amp;quot;, i32 260}
+#endif //USE_MAX_BLOCKS
+
 // Make sure we don&amp;#x27;t emit negative launch bounds values.
 __global__ void
 __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
@@ -80,7 +130,26 @@ Kernel7()
 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, 
!&amp;quot;maxntidx&amp;quot;,
 // CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, 
!&amp;quot;minctasm&amp;quot;,
 
+#ifdef USE_MAX_BLOCKS
+__global__ void
+__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, 
-MAX_BLOCKS_PER_MP )
+Kernel7_sm_90()
+{
+}
+// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, 
!&amp;quot;maxntidx&amp;quot;,
+// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, 
!&amp;quot;minctasm&amp;quot;,
+// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, 
!&amp;quot;maxclusterrank&amp;quot;,
+#endif // USE_MAX_BLOCKS
+
 const char constchar = 12;
 __global__ void __launch_bounds__(constint, constchar) Kernel8() {}
 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, 
!&amp;quot;maxntidx&amp;quot;, i32 100
 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, 
!&amp;quot;minctasm&amp;quot;, i32 12
+
+#ifdef USE_MAX_BLOCKS
+const char constchar_2 = 14;
+__global__ void __launch_bounds__(constint, constchar, constchar_2) 
Kernel8_sm_90() {}
+// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, 
!&amp;quot;maxntidx&amp;quot;, i32 100
+// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, 
!&amp;quot;minctasm&amp;quot;, i32 12
+// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, 
!&amp;quot;maxclusterrank&amp;quot;, i32 14
+#endif // USE_MAX_BLOCKS
diff --git a/clang/test/SemaCUDA/launch_bounds.cu 
b/clang/test/SemaCUDA/launch_bounds.cu
index 0ca0c0145d8bbb6..b1f29480da30c65 100644
--- a/clang/test/SemaCUDA/launch_bounds.cu
+++ b/clang/test/SemaCUDA/launch_bounds.cu
@@ -12,7 +12,7 @@ __launch_bounds__(0x10000000000000000) void 
TestWayTooBigArg(void); // expected-
 __launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning 
{{&amp;#x27;launch_bounds&amp;#x27; attribute parameter 0 is negative and will 
be ignored}}
 __launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning 
{{&amp;#x27;launch_bounds&amp;#x27; attribute parameter 1 is negative and will 
be ignored}}
 
-__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-error 
{{&amp;#x27;launch_bounds&amp;#x27; attribute takes no more than 2 arguments}}
+__launch_bounds__(1, 2, 3, 4) void Test4Args(void); // expected-error 
{{&amp;#x27;launch_bounds&amp;#x27; attribute takes no more than 3 arguments}}
 __launch_bounds__() void TestNoArgs(void); // expected-error 
{{&amp;#x27;launch_bounds&amp;#x27; attribute takes at least 1 argument}}
 
 int TestNoFunction __launch_bounds__(128, 7); // expected-warning 
{{&amp;#x27;launch_bounds&amp;#x27; attribute only applies to Objective-C 
methods, functions, and function pointers}}
@@ -47,3 +47,5 @@ __launch_bounds__(Args) void TestTemplateVariadicArgs(void) 
{} // expected-error
 
 template &amp;lt;int... Args&amp;gt;
 __launch_bounds__(1, Args) void TestTemplateVariadicArgs2(void) {} // 
expected-error {{expression contains unexpanded parameter pack 
&amp;#x27;Args&amp;#x27;}}
+
+__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-warning 
{{maxclusterrank requires sm_90 or higher, CUDA arch provided: unknown, 
ignoring &amp;#x27;launch_bounds&amp;#x27; attribute}}
diff --git a/clang/test/SemaCUDA/launch_bounds_sm_90.cu 
b/clang/test/SemaCUDA/launch_bounds_sm_90.cu
new file mode 100644
index 000000000000000..6b2369983b74fbb
--- /dev/null
+++ b/clang/test/SemaCUDA/launch_bounds_sm_90.cu
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -triple nvptx-unknown-unknown 
-target-cpu sm_90  -verify %s
+
+#include &amp;quot;Inputs/cuda.h&amp;quot;
+
+__launch_bounds__(128, 7) void Test2Args(void);
+__launch_bounds__(128) void Test1Arg(void);
+
+__launch_bounds__(0xffffffff) void TestMaxArg(void);
+__launch_bounds__(0x100000000) void TestTooBigArg(void); // expected-error 
{{integer constant expression evaluates to value 4294967296 that cannot be 
represented in a 32-bit unsigned integer type}}
+__launch_bounds__(0x10000000000000000) void TestWayTooBigArg(void); // 
expected-error {{integer literal is too large to be represented in any integer 
type}}
+__launch_bounds__(1, 1, 0x10000000000000000) void TestWayTooBigArg(void); // 
expected-error {{integer literal is too large to be represented in any integer 
type}}
+
+__launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning 
{{&amp;#x27;launch_bounds&amp;#x27; attribute parameter 0 is negative and will 
be ignored}}
+__launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning 
{{&amp;#x27;launch_bounds&amp;#x27; attribute parameter 1 is negative and will 
be ignored}}
+__launch_bounds__(128, 1, -7) void TestNegArg2(void); // expected-warning 
{{&amp;#x27;launch_bounds&amp;#x27; attribute parameter 2 is negative and will 
be ignored}}
+
+
+__launch_bounds__(1, 2, 3, 4) void Test4Args(void); // expected-error 
{{&amp;#x27;launch_bounds&amp;#x27; attribute takes no more than 3 arguments}}
+__launch_bounds__() void TestNoArgs(void); // expected-error 
{{&amp;#x27;launch_bounds&amp;#x27; attribute takes at least 1 argument}}
+
+int TestNoFunction __launch_bounds__(128, 7, 13); // expected-warning 
{{&amp;#x27;launch_bounds&amp;#x27; attribute only applies to Objective-C 
methods, functions, and function pointers}}
+
+__launch_bounds__(true) void TestBool(void);
+__launch_bounds__(128, 1, 128.0) void TestFP(void); // expected-error 
{{&amp;#x27;launch_bounds&amp;#x27; attribute requires parameter 2 to be an 
integer constant}}
+__launch_bounds__(128, 1, (void*)0) void TestNullptr(void); // expected-error 
{{&amp;#x27;launch_bounds&amp;#x27; ...
<truncated>
</pre>
</details>


https://github.com/llvm/llvm-project/pull/66496
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [NVPTX] Add support for maxclusterrank in launch_bounds (PR #66496)

Reply via email to