https://github.com/AustinSchuh updated https://github.com/llvm/llvm-project/pull/132881
>From 381ba9f21b4c2a2c4028143b84e9f71c8a20692f Mon Sep 17 00:00:00 2001 From: Austin Schuh <austin.li...@gmail.com> Date: Mon, 24 Mar 2025 21:42:45 -0700 Subject: [PATCH 1/2] cuda clang: Fix argument order for __reduce_max_sync The following cuda kernel would crash with an "an illegal instruction was encountered" message. __global__ void testcode(const float* data, unsigned *max_value) { unsigned r = static_cast<unsigned>(data[threadIdx.x]); const unsigned mask = __ballot_sync(0xFFFFFFFF, true); unsigned mx = __reduce_max_sync(mask, r); atomicMax(max_value, mx); } Digging into the ptx from both nvcc and clang, I discovered that the arguments for the mask and value were swapped. This swaps them back. Fixes: https://github.com/llvm/llvm-project/issues/131415 Signed-off-by: Austin Schuh <austin.li...@gmail.com> --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index b2e05a567b4fe..1943e94c3ee7a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -315,7 +315,7 @@ defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_s multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> { def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask), "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;", - [(set i32:$dst, (Intrin i32:$src, Int32Regs:$mask))]>, + [(set i32:$dst, (Intrin i32:$mask, Int32Regs:$src))]>, Requires<[hasPTX<70>, hasSM<80>]>; } >From aa9148ce6af19a799f37aea6d973087c8fb7c53c Mon Sep 17 00:00:00 2001 From: Austin Schuh <austin.li...@gmail.com> Date: Tue, 25 Mar 2025 12:12:47 -0700 Subject: [PATCH 2/2] Swap in __clang_cuda_intrinsics.h instead --- clang/lib/Headers/__clang_cuda_intrinsics.h | 16 ++++++++-------- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h index a04e8b6de44d0..8b230af6f6647 100644 --- a/clang/lib/Headers/__clang_cuda_intrinsics.h +++ b/clang/lib/Headers/__clang_cuda_intrinsics.h @@ -515,32 +515,32 @@ __device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 __device__ inline unsigned __reduce_add_sync(unsigned __mask, unsigned __value) { - return __nvvm_redux_sync_add(__mask, __value); + return __nvvm_redux_sync_add(__value, __mask); } __device__ inline unsigned __reduce_min_sync(unsigned __mask, unsigned __value) { - return __nvvm_redux_sync_umin(__mask, __value); + return __nvvm_redux_sync_umin(__value, __mask); } __device__ inline unsigned __reduce_max_sync(unsigned __mask, unsigned __value) { - return __nvvm_redux_sync_umax(__mask, __value); + return __nvvm_redux_sync_umax(__value, __mask); } __device__ inline int __reduce_min_sync(unsigned __mask, int __value) { - return __nvvm_redux_sync_min(__mask, __value); + return __nvvm_redux_sync_min(__value, __mask); } __device__ inline int __reduce_max_sync(unsigned __mask, int __value) { - return __nvvm_redux_sync_max(__mask, __value); + return __nvvm_redux_sync_max(__value, __mask); } __device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) { - return __nvvm_redux_sync_or(__mask, __value); + return __nvvm_redux_sync_or(__value, __mask); } __device__ inline unsigned __reduce_and_sync(unsigned __mask, unsigned __value) { - return __nvvm_redux_sync_and(__mask, __value); + return __nvvm_redux_sync_and(__value, __mask); } __device__ inline unsigned __reduce_xor_sync(unsigned __mask, unsigned __value) { - return __nvvm_redux_sync_xor(__mask, __value); + return __nvvm_redux_sync_xor(__value, __mask); } __device__ inline void __nv_memcpy_async_shared_global_4(void *__dst, diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 1943e94c3ee7a..b2e05a567b4fe 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -315,7 +315,7 @@ defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_s multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> { def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask), "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;", - [(set i32:$dst, (Intrin i32:$mask, Int32Regs:$src))]>, + [(set i32:$dst, (Intrin i32:$src, Int32Regs:$mask))]>, Requires<[hasPTX<70>, hasSM<80>]>; } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits