[clang] [llvm] [mlir] [openmp] [OpenMP][offload] Cross-team reductions with variable number of teams (PR #195102)

Robert Imschweiler via cfe-commits Tue, 02 Jun 2026 00:46:54 -0700

================
@@ -173,144 +201,99 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
   return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
+// Reduction across teams on the GPU.
+//
+// Parameters:
+// - Loc: Location of the reduction
+// - reduce_data: Pointer to the reduction data
+// - shflFct:  Shuffle reduction function
+// - cpyFct:   Inter-warp copy function (copies data from each warp's thread 0
+//             to the lanes of the zeroth warp)
+// - lgcpyFct: List-global copy function (copies the reduction data from the
+//             local thread to the global buffer)
+// - glcpyFct: Global copy function (copies the reduction data from the global
+//             buffer to the local thread)
+// - glredFct: Global reduce function (reduces the reduction data from the
+//             global buffer to the local thread)
+//
+// Returns:
+// - 1 if this thread must write the final reduced value back to the shared
+//   reduction variable (i.e. thread 0 of the single team when NumTeams == 1,
+//   or thread 0 of the last team to finish its partial reduction otherwise).
+// - 0 otherwise.
+//
 [[clang::always_inline]]
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
-    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
-    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
-  // Terminate all threads in non-SPMD mode except for the master thread.
-  uint32_t ThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isGenericMode()) {
+int32_t __kmpc_gpu_xteam_reduce_nowait(IdentTy *Loc, void *reduce_data,
+                                       ShuffleReductFnTy shflFct,
+                                       InterWarpCopyFnTy cpyFct,
+                                       ListGlobalFnTy lgcpyFct,
+                                       ListGlobalFnTy glcpyFct,
+                                       ListGlobalFnTy glredFct) {
+  uint32_t ThreadId;
+  uint32_t NumThreads;
+
+  if (mapping::isSPMDMode()) {
+    // In SPMD mode all workers participate in the teams reduction.
+    ThreadId = mapping::getThreadIdInBlock();
+    NumThreads = mapping::getNumberOfThreadsInBlock();
+  } else {
+    // In generic mode, only the team master participates in the teams
+    // reduction because the workers are waiting for parallel work.
     if (!mapping::isMainThreadInGenericMode())
       return 0;
     ThreadId = 0;
+    NumThreads = 1;
   }
 
-  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
-  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
-  // In non-generic mode all workers participate in the teams reduction.
-  // In generic mode only the team master participates in the teams
-  // reduction because the workers are waiting for parallel work.
-  uint32_t NumThreads = omp_get_num_threads();
   uint32_t TeamId = omp_get_team_num();
   uint32_t NumTeams = omp_get_num_teams();
-  [[clang::loader_uninitialized]] static Local<unsigned> Bound;
-  [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
-
-  // Block progress for teams greater than the current upper
-  // limit. We always only allow a number of teams less or equal
-  // to the number of slots in the buffer.
-  bool IsMaster = (ThreadId == 0);
-  while (IsMaster) {
-    Bound = atomic::load(&IterCnt, atomic::acquire);
-    if (TeamId < Bound + num_of_records)
-      break;
-  }
 
-  if (IsMaster) {
-    int ModBockId = TeamId % num_of_records;
-    if (TeamId < num_of_records) {
-      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
-    } else
-      lgredFct(GlobalBuffer, ModBockId, reduce_data);
-
-    // Propagate the memory writes above to the world.
-    fence::kernel(atomic::release);
-
-    // Increment team counter.
-    // This counter is incremented by all teams in the current
-    // num_of_records chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
-                                 atomic::MemScopeTy::device);
+  // Fast path for single-team kernels: no cross-team work required,
+  // the team-local reduction already produced the final result.
+  if (NumTeams <= 1)
+    return ThreadId == 0;
+
+  uint32_t &TeamsDone = state::getKernelLaunchEnvironment().ReductionTeamsDone;
+  void *GlobalBuffer = state::getKernelLaunchEnvironment().ReductionBuffer;
+  [[clang::loader_uninitialized]] static Local<uint32_t> TeamsDoneResult;
+
+  // Save the team's reduced value in the global buffer and atomically
+  // increment the teams-done counter.
+  if (ThreadId == 0) {
+    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+    // We let the atomic inc wrap around if the value gets larger than
+    // NumTeams-1, which makes the counter self-reset.
+    TeamsDoneResult = atomic::inc(&TeamsDone, NumTeams - 1u, atomic::acq_rel,
+                                  atomic::MemScopeTy::device);
   }
 
-  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
-  // state machine.
+  // This sync is needed so that all threads from last team see the shared teams
+  // done counter value and know that they are in the last team.
   if (mapping::isSPMDMode())
     synchronize::threadsAligned(atomic::acq_rel);
----------------
ro-i wrote:


There is a memory barrier mentioned, but it is not explicit about which scope it
applies to. The implementation is __syncthreads, as far as I can see. I think I
would feel better having the barrier

https://github.com/llvm/llvm-project/pull/195102
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [openmp] [OpenMP][offload] Cross-team reductions with variable number of teams (PR #195102)

Reply via email to