Author: Wenju He
Date: 2026-04-13T08:16:06+08:00
New Revision: 7b94b9ae13c69075a85d3af8b987a38610fbce5a

URL: 
https://github.com/llvm/llvm-project/commit/7b94b9ae13c69075a85d3af8b987a38610fbce5a
DIFF: 
https://github.com/llvm/llvm-project/commit/7b94b9ae13c69075a85d3af8b987a38610fbce5a.diff

LOG: [libclc] Refine generic __clc_get_sub_group_size with fast full sub-group path (#188895)

Add a fast path for the common case where the total work-group size is a
multiple of the max sub-group size.

The fallback path is ported from amdgpu/workitem/clc_get_sub_group_size.cl.

The compiler can generate predicated instructions for the fallback path to
avoid branches.

Added: 
    

Modified: 
    libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl

Removed: 
    


################################################################################
diff  --git a/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl b/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl
index 7944486aac0f0..7f96fc8c31717 100644
--- a/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl
+++ b/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl
@@ -6,21 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clc/shared/clc_min.h"
+#include "clc/workitem/clc_get_local_linear_id.h"
 #include "clc/workitem/clc_get_local_size.h"
 #include "clc/workitem/clc_get_max_sub_group_size.h"
-#include "clc/workitem/clc_get_num_sub_groups.h"
-#include "clc/workitem/clc_get_sub_group_id.h"
 #include "clc/workitem/clc_get_sub_group_size.h"
 
 _CLC_OVERLOAD _CLC_DEF uint __clc_get_sub_group_size() {
-  if (__clc_get_sub_group_id() != __clc_get_num_sub_groups() - 1) {
-    return __clc_get_max_sub_group_size();
-  }
-  size_t size_x = __clc_get_local_size(0);
-  size_t size_y = __clc_get_local_size(1);
-  size_t size_z = __clc_get_local_size(2);
-  size_t linear_size = size_z * size_y * size_x;
-  size_t uniform_groups = __clc_get_num_sub_groups() - 1;
-  size_t uniform_size = __clc_get_max_sub_group_size() * uniform_groups;
-  return linear_size - uniform_size;
+  uint local_linear_size = (uint)__clc_get_local_size(0) *
+                           (uint)__clc_get_local_size(1) *
+                           (uint)__clc_get_local_size(2);
+  uint max_sg_size = __clc_get_max_sub_group_size();
+  // Assume max_sg_size is power of 2.
+  uint remainder = local_linear_size & (max_sg_size - 1);
+  if (remainder == 0)
+    return max_sg_size;
+  uint lid = (uint)__clc_get_local_linear_id();
+  return __clc_min(max_sg_size, local_linear_size - (lid & ~(max_sg_size - 1)));
 }


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to