================ @@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) { // Shuffles the the lanes inside the warp according to the given index. _DEFAULT_FN_ATTRS static __inline__ uint32_t -__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) { +__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, + uint32_t __width) { uint32_t __mask = (uint32_t)__lane_mask; - return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u); + return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, + ((__gpu_num_lanes() - __width) << 8u) | 0x1f); ---------------- Artem-B wrote:
Hmm... It looks like the CUDA SDK implements shfl_sync_idx in its own headers the same way. OK, I'm officially confused now, but given that it has been implemented this way for about a decade, I'm fine with keeping it as is until there's concrete evidence that it's broken. In practice, it probably means that we can't (and don't) really use non-default values for width on NVIDIA GPUs. https://github.com/llvm/llvm-project/pull/125896 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits