https://github.com/kevinsala created 
https://github.com/llvm/llvm-project/pull/198719

This PR adds the option 
`-openmp-ir-builder-use-default-max-thread=<boolean-value>` to enable or 
disable the use of a default max threads in OpenMPIRBuilder when no max threads 
constant is provided. The option is enabled by default to maintain the same 
behavior as it is currently.

This flag is useful to avoid limiting the number of threads that an OpenMP 
target region can run with when no `thread_limit` or `num_threads` (in a nested 
parallel region) are specified. This flag may be used when recording a kernel 
to allow replaying it later with a higher number of threads (e.g., reaching the 
maximum thread limit supported by the device).

>From 76e503025baf29bd1f4e876e245f32e4efbd7b99 Mon Sep 17 00:00:00 2001
From: Kevin Sala <[email protected]>
Date: Wed, 20 May 2026 00:40:11 -0700
Subject: [PATCH] [llvm][OpenMP] Add option to disable default max threads
 adjustment

This commit adds the option 
-openmp-ir-builder-use-default-max-thread=<boolean-value>
to enable or disable the use of a default max threads in OpenMPIRBuilder when 
no max
threads are provided. The option is enabled by default, maintaining the same 
behavior
as it is currently.
---
 clang/test/OpenMP/thread_limit_gpu.c      | 29 +++++++++++++++++++----
 clang/test/OpenMP/thread_limit_nvptx.c    | 26 +++++++++++++-------
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 10 +++++---
 3 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/clang/test/OpenMP/thread_limit_gpu.c 
b/clang/test/OpenMP/thread_limit_gpu.c
index 829b0a1b02d22..c976e9e72b1ba 100644
--- a/clang/test/OpenMP/thread_limit_gpu.c
+++ b/clang/test/OpenMP/thread_limit_gpu.c
@@ -1,8 +1,10 @@
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu 
-fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa 
-fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device 
-fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck 
-check-prefixes=CHECK,CHECK-AMDGPU %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa 
-fopenmp-targets=amdgcn-amd-amdhsa -mllvm 
-openmp-ir-builder-use-default-max-threads=false -emit-llvm %s 
-fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | 
FileCheck -check-prefixes=CHECK,CHECK-AMDGPU-FLAG %s
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu 
-fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-x86-spirv-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel 
-fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device 
-fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - | FileCheck 
-check-prefixes=CHECK,CHECK-SPIRV %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel 
-fopenmp-targets=spirv64-intel -mllvm 
-openmp-ir-builder-use-default-max-threads=false -emit-llvm %s 
-fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - 
| FileCheck -check-prefixes=CHECK,CHECK-SPIRV-FLAG %s
 // expected-no-diagnostics
 
 #ifndef HEADER
@@ -28,11 +30,11 @@ void foo(int N) {
 
 #endif
 
-// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l12({{.*}}) #[[ATTR1:.+]] {
-// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l15({{.*}}) #[[ATTR2:.+]] {
-// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l18({{.*}}) #[[ATTR3:.+]] {
-// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l21({{.*}}) #[[ATTR4:.+]] {
-// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l24({{.*}}) #[[ATTR5:.+]] {
+// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l14({{.*}}) #[[ATTR1:.+]] {
+// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l17({{.*}}) #[[ATTR2:.+]] {
+// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l20({{.*}}) #[[ATTR3:.+]] {
+// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l23({{.*}}) #[[ATTR4:.+]] {
+// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l26({{.*}}) #[[ATTR5:.+]] {
 
 // CHECK-AMDGPU: attributes #[[ATTR1]] = { {{.*}} 
"amdgpu-flat-work-group-size"="1,256" {{.*}} }
 // CHECK-AMDGPU: attributes #[[ATTR2]] = { {{.*}} 
"amdgpu-flat-work-group-size"="1,4" {{.*}} }
@@ -45,3 +47,20 @@ void foo(int N) {
 // CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="84" 
"omp_target_thread_limit"="42" {{.*}} }
 // CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="84" 
"omp_target_thread_limit"="22" {{.*}} }
 // CHECK-SPIRV: attributes #[[ATTR5]] = { {{.*}} "omp_target_num_teams"="84" 
"omp_target_thread_limit"="20" {{.*}} }
+
+// CHECK-AMDGPU-FLAG: attributes #[[ATTR1]] = {
+// CHECK-AMDGPU-FLAG-NOT: amdgpu-flat-work-group-size
+// CHECK-AMDGPU-FLAG-NOT: omp_target_thread_limit
+// CHECK-AMDGPU-FLAG-SAME: }
+// CHECK-AMDGPU-FLAG: attributes #[[ATTR2]] = { {{.*}} 
"amdgpu-flat-work-group-size"="1,4" {{.*}} }
+// CHECK-AMDGPU-FLAG: attributes #[[ATTR3]] = { {{.*}} 
"amdgpu-flat-work-group-size"="1,42" {{.*}} }
+// CHECK-AMDGPU-FLAG: attributes #[[ATTR4]] = { {{.*}} 
"amdgpu-flat-work-group-size"="1,22" {{.*}} }
+// CHECK-AMDGPU-FLAG: attributes #[[ATTR5]] = { {{.*}} 
"amdgpu-flat-work-group-size"="1,20" "amdgpu-max-num-workgroups"="86,1,1" 
{{.*}} }
+
+// CHECK-SPIRV-FLAG: attributes #[[ATTR1]] = {
+// CHECK-SPIRV-FLAG-NOT: omp_target_thread_limit
+// CHECK-SPIRV-FLAG-SAME: }
+// CHECK-SPIRV-FLAG: attributes #[[ATTR2]] = { {{.*}} 
"omp_target_thread_limit"="4"  {{.*}} }
+// CHECK-SPIRV-FLAG: attributes #[[ATTR3]] = { {{.*}} 
"omp_target_num_teams"="84" "omp_target_thread_limit"="42" {{.*}} }
+// CHECK-SPIRV-FLAG: attributes #[[ATTR4]] = { {{.*}} 
"omp_target_num_teams"="84" "omp_target_thread_limit"="22" {{.*}} }
+// CHECK-SPIRV-FLAG: attributes #[[ATTR5]] = { {{.*}} 
"omp_target_num_teams"="84" "omp_target_thread_limit"="20" {{.*}} }
diff --git a/clang/test/OpenMP/thread_limit_nvptx.c 
b/clang/test/OpenMP/thread_limit_nvptx.c
index ffa6c453067d1..c4a83ca052c95 100644
--- a/clang/test/OpenMP/thread_limit_nvptx.c
+++ b/clang/test/OpenMP/thread_limit_nvptx.c
@@ -1,27 +1,28 @@
 // Test target codegen - host bc file has to be created first.
 //
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown 
-fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown 
-fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device 
-fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown 
-fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device 
-fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s 
-check-prefixes=CHECK,CHECK-NVPTX
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown 
-fopenmp-targets=nvptx-nvidia-cuda -mllvm 
-openmp-ir-builder-use-default-max-threads=false -emit-llvm %s 
-fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | 
FileCheck -check-prefix=CHECK,CHECK-NVPTX-FLAG %s
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
 
 void foo(int N) {
-// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]]
+// CHECK: define {{.*}}l12{{.*}} #[[ATTR0:[0-9]+]]
 #pragma omp target teams distribute parallel for simd
   for (int i = 0; i < N; ++i)
     ;
-// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]]
+// CHECK: define {{.*}}l16{{.*}} #[[ATTR1:[0-9]+]]
 #pragma omp target teams distribute parallel for simd thread_limit(4)
   for (int i = 0; i < N; ++i)
     ;
 
-// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]]
+// CHECK: define {{.*}}l21{{.*}} #[[ATTR2:[0-9]+]]
 #pragma omp target teams distribute parallel for simd 
ompx_attribute(__attribute__((launch_bounds(42, 42))))
   for (int i = 0; i < N; ++i)
     ;
 
-// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]]
+// CHECK: define {{.*}}l26{{.*}} #[[ATTR3:[0-9]+]]
 #pragma omp target teams distribute parallel for simd 
ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
   for (int i = 0; i < N; ++i)
     ;
@@ -29,7 +30,14 @@ void foo(int N) {
 
 #endif
 
-// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}}
-// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
-// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}}
-// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}}
+// CHECK-NVPTX: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}}
+// CHECK-NVPTX: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
+// CHECK-NVPTX: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}}
+// CHECK-NVPTX: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}}
+
+// CHECK-NVPTX-FLAG: attributes #[[ATTR0]] = {
+// CHECK-NVPTX-FLAG-NOT: nvvm.maxntid
+// CHECK-NVPTX-FLAG-SAME: }
+// CHECK-NVPTX-FLAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
+// CHECK-NVPTX-FLAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" 
{{.*}}}
+// CHECK-NVPTX-FLAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" 
{{.*}}}
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp 
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 06026582538a2..57dc682838317 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -83,6 +83,10 @@ static cl::opt<double> UnrollThresholdFactor(
              "simplifications still taking place"),
     cl::init(1.5));
 
+static cl::opt<bool> UseDefaultMaxThreads(
+    "openmp-ir-builder-use-default-max-threads", cl::Hidden,
+    cl::desc("Use a default max threads if none is provided."), 
cl::init(true));
+
 #ifndef NDEBUG
 /// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
 /// at position IP1 may change the meaning of IP2 or vice-versa. This is 
because
@@ -8155,10 +8159,10 @@ OpenMPIRBuilder::InsertPointTy 
OpenMPIRBuilder::createTargetInit(
   if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
     writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
 
-  // If MaxThreads not set, select the maximum between the default workgroup
-  // size and the MinThreads value.
+  // If MaxThreads is not set and needs adjustment, select the maximum between
+  // the default workgroup size and the MinThreads value.
   int32_t MaxThreadsVal = Attrs.MaxThreads.front();
-  if (MaxThreadsVal < 0) {
+  if (MaxThreadsVal < 0 && UseDefaultMaxThreads) {
     if (hasGridValue(T)) {
       MaxThreadsVal =
           std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size),

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to