https://github.com/DenisGZM updated https://github.com/llvm/llvm-project/pull/99646
>From accdc9bee5a6f2ee2add330dd8d06d280cd13b64 Mon Sep 17 00:00:00 2001 From: Denis Gerasimov <denis.gerasi...@baikalelectronics.ru> Date: Fri, 19 Jul 2024 15:47:57 +0300 Subject: [PATCH 1/8] [NVPTX] Support __usAtomicCAS builtin --- clang/include/clang/Basic/BuiltinsNVPTX.def | 3 +++ clang/lib/CodeGen/CGBuiltin.cpp | 3 +++ clang/lib/Headers/__clang_cuda_device_functions.h | 12 ++++++++++++ clang/test/CodeGen/builtins-nvptx.c | 3 +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 15 +++++++++++++++ 6 files changed, 37 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e91..3cf683a2f21298 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -829,6 +829,9 @@ BUILTIN(__nvvm_atom_xor_gen_ll, "LLiLLiD*LLi", "n") TARGET_BUILTIN(__nvvm_atom_cta_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60) +TARGET_BUILTIN(__nvvm_atom_cas_gen_us, "UsUsD*UsUs", "n", SM_70) +TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_us, "UsUsD*UsUs", "n", SM_70) +TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_us, "UsUsD*UsUs", "n", SM_70) BUILTIN(__nvvm_atom_cas_gen_i, "iiD*ii", "n") TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_i, "iiD*ii", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_i, "iiD*ii", "n", SM_60) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index f424ddaa175400..769b3ce1886baf 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -20357,6 +20357,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_atom_min_gen_ull: return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E); + case NVPTX::BI__nvvm_atom_cas_gen_us: case NVPTX::BI__nvvm_atom_cas_gen_i: case NVPTX::BI__nvvm_atom_cas_gen_l: case NVPTX::BI__nvvm_atom_cas_gen_ll: @@ -20548,6 +20549,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_atom_sys_xor_gen_l: case NVPTX::BI__nvvm_atom_sys_xor_gen_ll: return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E); + case NVPTX::BI__nvvm_atom_cta_cas_gen_us: case NVPTX::BI__nvvm_atom_cta_cas_gen_i: case NVPTX::BI__nvvm_atom_cta_cas_gen_l: case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: { @@ -20559,6 +20561,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}), {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))}); } + case NVPTX::BI__nvvm_atom_sys_cas_gen_us: case NVPTX::BI__nvvm_atom_sys_cas_gen_i: case NVPTX::BI__nvvm_atom_sys_cas_gen_l: case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: { diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h index f8a12cefdb81b4..f66fe625a39676 100644 --- a/clang/lib/Headers/__clang_cuda_device_functions.h +++ b/clang/lib/Headers/__clang_cuda_device_functions.h @@ -529,6 +529,18 @@ __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); } __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }; __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }; __DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); } +__DEVICE__ unsigned short __usAtomicCAS(unsigned short *__p, unsigned short __cmp, + unsigned short __v) { + return __nvvm_atom_cas_gen_us(__p, __cmp, __v); +} +__DEVICE__ unsigned short __usAtomicCAS_block(unsigned short *__p, unsigned short __cmp, + unsigned short __v) { + return __nvvm_atom_cta_cas_gen_us(__p, __cmp, __v); +} +__DEVICE__ unsigned short __usAtomicCAS_system(unsigned short *__p, unsigned short __cmp, + unsigned short __v) { + return __nvvm_atom_sys_cas_gen_us(__p, __cmp, __v); +} __DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) { return __nvvm_atom_add_gen_i((int *)__p, __v); } diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 75b9d6d1fe1902..3ba1fabd05335e 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -306,6 +306,9 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip, // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8 __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll); + // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2 + // CHECK-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0 + __nvvm_atom_cas_gen_us(ip, 0, i); // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4 // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0 __nvvm_atom_cas_gen_i(ip, 0, i); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 577141299b9496..7847a8f838c62f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -872,7 +872,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // actions computeRegisterProperties(STI.getRegisterInfo()); - setMinCmpXchgSizeInBits(32); + setMinCmpXchgSizeInBits(16); setMaxAtomicSizeInBitsSupported(64); setMaxDivRemBitWidthSupported(64); } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 887951b55fb3b7..6cc3fad55b734d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2039,6 +2039,12 @@ defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64 // atom_cas +def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>; def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), @@ -2052,6 +2058,14 @@ def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; +defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas", + atomic_cmp_swap_i16_g, i16imm>; +defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas", + atomic_cmp_swap_i16_s, i16imm>; +defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas", + atomic_cmp_swap_i16_gen, i16imm>; +defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas", + atomic_cmp_swap_i16_gen, i16imm>; defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas", atomic_cmp_swap_i32_g, i32imm>; defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas", @@ -2263,6 +2277,7 @@ multiclass ATOM2_incdec_impl<string OpStr> { // atom.cas multiclass ATOM3_cas_impl<string OpStr> { + defm _b16 : ATOM3S_impl<OpStr, "i", "b16", i16, Int16Regs, i16imm, imm, i16, []>; defm _b32 : ATOM3S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>; defm _b64 : ATOM3S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, []>; } >From 3defc83dc25921a56bfb99874616ac17e9eed291 Mon Sep 17 00:00:00 2001 From: Denis Gerasimov <denis.gerasi...@baikalelectronics.ru> Date: Fri, 19 Jul 2024 18:26:41 +0300 Subject: [PATCH 2/8] Consider SM and PTX versions. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 8 +- llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 140 +++++++------------- 3 files changed, 56 insertions(+), 95 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7847a8f838c62f..81d6b79b291cc5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -872,7 +872,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // actions computeRegisterProperties(STI.getRegisterInfo()); - setMinCmpXchgSizeInBits(16); + bool Allow16BitCAS = STI.getSmVersion() >= 70 && STI.getPTXVersion() >= 63; + setMinCmpXchgSizeInBits(Allow16BitCAS ? 16 : 32); setMaxAtomicSizeInBitsSupported(64); setMaxDivRemBitWidthSupported(64); } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 6cc3fad55b734d..3f6b4fc80fe8bd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2059,13 +2059,13 @@ def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas", - atomic_cmp_swap_i16_g, i16imm>; + atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas", - atomic_cmp_swap_i16_s, i16imm>; + atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas", - atomic_cmp_swap_i16_gen, i16imm>; + atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas", - atomic_cmp_swap_i16_gen, i16imm>; + atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas", atomic_cmp_swap_i32_g, i32imm>; defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas", diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index 9301ea44c69367..d69dd8ad1c9405 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -45,102 +45,62 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; ; CHECKPTX71-LABEL: test( ; CHECKPTX71: { -; CHECKPTX71-NEXT: .reg .pred %p<5>; -; CHECKPTX71-NEXT: .reg .b16 %rs<18>; -; CHECKPTX71-NEXT: .reg .b32 %r<58>; -; CHECKPTX71-NEXT: .reg .f32 %f<12>; +; CHECKPTX71-NEXT: .reg .pred %p<5>; +; CHECKPTX71-NEXT: .reg .b16 %rs<34>; +; CHECKPTX71-NEXT: .reg .b32 %r<4>; +; CHECKPTX71-NEXT: .reg .f32 %f<12>; ; CHECKPTX71-EMPTY: ; CHECKPTX71-NEXT: // %bb.0: -; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3]; -; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2]; -; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1]; -; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0]; -; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4; -; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3; -; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3; -; CHECKPTX71-NEXT: mov.b32 %r26, 65535; -; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX71-NEXT: not.b32 %r3, %r27; -; CHECKPTX71-NEXT: ld.u32 %r54, [%r1]; -; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs1; -; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start +; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3]; +; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2]; +; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1]; +; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0]; +; CHECKPTX71-NEXT: ld.b16 %rs30, [%r1]; +; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13; +; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28; -; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs2; -; CHECKPTX71-NEXT: add.rn.f32 %f3, %f1, %f2; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs4, %f3; -; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4; -; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; -; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; -; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32; -; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54; -; CHECKPTX71-NEXT: mov.u32 %r54, %r6; -; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; -; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end -; CHECKPTX71-NEXT: ld.u32 %r55, [%r1]; -; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start9 +; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs30; +; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1; +; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3; +; CHECKPTX71-NEXT: atom.cas.b16 %rs17, [%r1], %rs30, %rs14; +; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs17, %rs30; +; CHECKPTX71-NEXT: mov.u16 %rs30, %rs17; +; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; +; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end +; CHECKPTX71-NEXT: ld.b16 %rs31, [%r1]; +; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start2 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs6, %r33; -; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs6; -; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs8, %f5; -; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs8; -; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; -; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; -; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37; -; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55; -; CHECKPTX71-NEXT: mov.u32 %r55, %r9; -; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; -; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end8 -; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4; -; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3; -; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24; -; CHECKPTX71-NEXT: shl.b32 %r40, %r26, %r11; -; CHECKPTX71-NEXT: not.b32 %r12, %r40; -; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10]; -; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start27 +; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs31; +; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs18, %f5; +; CHECKPTX71-NEXT: atom.cas.b16 %rs21, [%r1], %rs31, %rs18; +; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs21, %rs31; +; CHECKPTX71-NEXT: mov.u16 %rs31, %rs21; +; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; +; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end1 +; CHECKPTX71-NEXT: ld.global.b16 %rs32, [%r2]; +; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start8 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs10, %r41; -; CHECKPTX71-NEXT: cvt.f32.bf16 %f6, %rs10; -; CHECKPTX71-NEXT: add.rn.f32 %f8, %f6, %f2; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs12, %f8; -; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs12; -; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; -; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45; -; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56; -; CHECKPTX71-NEXT: mov.u32 %r56, %r15; -; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; -; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end26 -; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4; -; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3; -; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24; -; CHECKPTX71-NEXT: shl.b32 %r48, %r26, %r17; -; CHECKPTX71-NEXT: not.b32 %r18, %r48; -; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16]; -; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start45 +; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs32; +; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1; +; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs22, %f8; +; CHECKPTX71-NEXT: atom.global.cas.b16 %rs25, [%r2], %rs32, %rs22; +; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs25, %rs32; +; CHECKPTX71-NEXT: mov.u16 %rs32, %rs25; +; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; +; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end7 +; CHECKPTX71-NEXT: ld.shared.b16 %rs33, [%r3]; +; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start14 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs14, %r49; -; CHECKPTX71-NEXT: cvt.f32.bf16 %f9, %rs14; -; CHECKPTX71-NEXT: add.rn.f32 %f11, %f9, %f2; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f11; -; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs16; -; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; -; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; -; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53; -; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57; -; CHECKPTX71-NEXT: mov.u32 %r57, %r21; -; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; -; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end44 -; CHECKPTX71-NEXT: ret; +; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs33; +; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1; +; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs26, %f11; +; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs29, [%r3], %rs33, %rs26; +; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs29, %rs33; +; CHECKPTX71-NEXT: mov.u16 %rs33, %rs29; +; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; +; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end13 +; CHECKPTX71-NEXT: ret; %r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst >From ecea3f0294d5efd879f4adcb6b5135e523559d99 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi <gonza...@nvidia.com> Date: Fri, 19 Jul 2024 09:20:58 -0700 Subject: [PATCH 3/8] [NVPTX] Add cas.b16 individual tests --- llvm/test/CodeGen/NVPTX/cmpxchg.ll | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg.ll diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll new file mode 100644 index 00000000000000..dd208aacb87142 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s --check-prefixes=SM30,CHECK +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_32 | %ptxas-verify %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=SM70,CHECK +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} + +; TODO: these are system scope, but are compiled to gpu scope.. +; TODO: these are seq_cst, but are compiled to relaxed.. + +; CHECK-LABEL: relaxed_sys_i8 +define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { + ; SM30: atom.cas.b32 + ; SM70: atom.cas.b16 + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +; CHECK-LABEL: relaxed_sys_i16 +define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { + ; SM30: atom.cas.b32 + ; SM70: atom.cas.b16 + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +; CHECK-LABEL: relaxed_sys_i32 +define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { + ; CHECK: atom.cas.b32 + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +; CHECK-LABEL: relaxed_sys_i64 +define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { + ; CHECK: atom.cas.b64 + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} >From 28cbfbe6a7d037a0ab9f3dbe98d77b3a8118136e Mon Sep 17 00:00:00 2001 From: Denis Gerasimov <denis.gerasi...@baikalelectronics.ru> Date: Fri, 19 Jul 2024 19:23:19 +0300 Subject: [PATCH 4/8] Added hasAtomCas16 feature --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +-- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 81d6b79b291cc5..0ff5b14eb0d7e8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -872,8 +872,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // actions computeRegisterProperties(STI.getRegisterInfo()); - bool Allow16BitCAS = STI.getSmVersion() >= 70 && STI.getPTXVersion() >= 63; - setMinCmpXchgSizeInBits(Allow16BitCAS ? 16 : 32); + setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32); setMaxAtomicSizeInBitsSupported(64); setMaxDivRemBitWidthSupported(64); } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index e47050734aae1e..0591782e8148b9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -77,6 +77,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasAtomScope() const { return SmVersion >= 60; } bool hasAtomBitwise64() const { return SmVersion >= 32; } bool hasAtomMinMax64() const { return SmVersion >= 32; } + bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; } bool hasLDG() const { return SmVersion >= 32; } bool hasHWROT32() const { return SmVersion >= 32; } bool hasImageHandles() const; >From 31fe45f018b36d087d886cd94abea15c1c95fabb Mon Sep 17 00:00:00 2001 From: Denis Gerasimov <denis.gerasi...@baikalelectronics.ru> Date: Fri, 19 Jul 2024 23:36:51 +0300 Subject: [PATCH 5/8] Fixed builtin test --- clang/test/CodeGen/builtins-nvptx.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 3ba1fabd05335e..dee68157b2b091 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1,4 +1,7 @@ // REQUIRES: nvptx-registered-target +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \ +// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s @@ -235,7 +238,8 @@ __shared__ long long sll; // Check for atomic intrinsics // CHECK-LABEL: nvvm_atom -__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip, +__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, + unsigned short *usp, unsigned short us, int *ip, int i, unsigned int *uip, unsigned ui, long *lp, long l, long long *llp, long long ll) { // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4 @@ -306,9 +310,6 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip, // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8 __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll); - // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2 - // CHECK-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0 - __nvvm_atom_cas_gen_us(ip, 0, i); // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4 // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0 __nvvm_atom_cas_gen_i(ip, 0, i); @@ -577,6 +578,12 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip, __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0); #endif +#if __CUDA_ARCH__ >= 700 + // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2 + // CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0 + __nvvm_atom_cas_gen_us(usp, 0, us); +#endif + // CHECK: ret } >From 1fb6f21088f5b90bc385b7f2b393d04295b7ad6b Mon Sep 17 00:00:00 2001 From: "Denis.Gerasimov" <deng...@gmail.com> Date: Fri, 2 Aug 2024 13:02:00 +0300 Subject: [PATCH 6/8] cta and sys intrinsics test --- clang/test/CodeGen/builtins-nvptx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index dee68157b2b091..4673c0c7eee014 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -582,6 +582,10 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2 // CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0 __nvvm_atom_cas_gen_us(usp, 0, us); + // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0 + __nvvm_atom_cta_cas_gen_us(usp, 0, us); + // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.sys.i16.p0 + __nvvm_atom_sys_cas_gen_us(usp, 0, us); #endif // CHECK: ret >From c18b58183b0b19fab98c9bcf0e5c7af7966cf2d4 Mon Sep 17 00:00:00 2001 From: "Denis.Gerasimov" <deng...@gmail.com> Date: Sun, 18 Aug 2024 14:19:41 +0300 Subject: [PATCH 7/8] Autogenerated test for cmpxchg --- llvm/test/CodeGen/NVPTX/cmpxchg.ll | 191 +++++++++++++++++++++++++- llvm/utils/UpdateTestChecks/common.py | 1 + 2 files changed, 186 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index dd208aacb87142..85ae5f0c8f6013 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march nvptx64 --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s --check-prefixes=SM30,CHECK ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_32 | %ptxas-verify %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=SM70,CHECK @@ -8,30 +9,208 @@ ; CHECK-LABEL: relaxed_sys_i8 define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { - ; SM30: atom.cas.b32 - ; SM70: atom.cas.b16 +; SM30-LABEL: relaxed_sys_i8( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB0_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB0_1; +; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0+0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: relaxed_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<17>; +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs9, [relaxed_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -2; +; SM70-NEXT: ld.param.u8 %rs10, [relaxed_sys_i8_param_1]; +; SM70-NEXT: and.b64 %rd3, %rd2, 1; +; SM70-NEXT: shl.b64 %rd4, %rd3, 3; +; SM70-NEXT: cvt.u32.u64 %r1, %rd4; +; SM70-NEXT: mov.u16 %rs11, 255; +; SM70-NEXT: shl.b16 %rs12, %rs11, %r1; +; SM70-NEXT: not.b16 %rs2, %rs12; +; SM70-NEXT: shl.b16 %rs3, %rs9, %r1; +; SM70-NEXT: shl.b16 %rs4, %rs10, %r1; +; SM70-NEXT: ld.u16 %rs13, [%rd1]; +; SM70-NEXT: and.b16 %rs16, %rs13, %rs2; +; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b16 %rs14, %rs16, %rs3; +; SM70-NEXT: or.b16 %rs15, %rs16, %rs4; +; SM70-NEXT: atom.cas.b16 %rs7, [%rd1], %rs15, %rs14; +; SM70-NEXT: setp.eq.s16 %p1, %rs7, %rs15; +; SM70-NEXT: @%p1 bra $L__BB0_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM70-NEXT: and.b16 %rs8, %rs7, %rs2; +; SM70-NEXT: setp.ne.s16 %p2, %rs16, %rs8; +; SM70-NEXT: mov.u16 %rs16, %rs8; +; SM70-NEXT: @%p2 bra $L__BB0_1; +; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM70-NEXT: cvt.u32.u16 %r2, %rs9; +; SM70-NEXT: st.param.b32 [func_retval0+0], %r2; +; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new } ; CHECK-LABEL: relaxed_sys_i16 define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { - ; SM30: atom.cas.b32 - ; SM70: atom.cas.b16 +; SM30-LABEL: relaxed_sys_i16( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 65535; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; +; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r16, %r19, %r3; +; SM30-NEXT: or.b32 %r17, %r19, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM30-NEXT: @%p1 bra $L__BB1_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM30-NEXT: mov.u32 %r19, %r8; +; SM30-NEXT: @%p2 bra $L__BB1_1; +; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0+0], %r14; +; SM30-NEXT: ret; +; +; SM70-LABEL: relaxed_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<4>; +; SM70-NEXT: .reg .b32 %r<2>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i16_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_1]; +; SM70-NEXT: ld.param.u16 %rs2, [relaxed_sys_i16_param_2]; +; SM70-NEXT: atom.cas.b16 %rs3, [%rd1], %rs1, %rs2; +; SM70-NEXT: cvt.u32.u16 %r1, %rs2; +; SM70-NEXT: st.param.b32 [func_retval0+0], %r1; +; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst ret i16 %new } ; CHECK-LABEL: relaxed_sys_i32 define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { - ; CHECK: atom.cas.b32 +; SM30-LABEL: relaxed_sys_i32( +; SM30: { +; SM30-NEXT: .reg .b32 %r<4>; +; SM30-NEXT: .reg .b64 %rd<2>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; +; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM30-NEXT: st.param.b32 [func_retval0+0], %r2; +; SM30-NEXT: ret; +; +; SM70-LABEL: relaxed_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; +; SM70-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0+0], %r2; +; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst ret i32 %new } ; CHECK-LABEL: relaxed_sys_i64 define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { - ; CHECK: atom.cas.b64 +; SM30-LABEL: relaxed_sys_i64( +; SM30: { +; SM30-NEXT: .reg .b64 %rd<5>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; +; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM30-NEXT: st.param.b64 [func_retval0+0], %rd3; +; SM30-NEXT: ret; +; +; SM70-LABEL: relaxed_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; +; SM70-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0+0], %rd3; +; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index eb212ed304e9db..c5e4ad4219c91d 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -636,6 +636,7 @@ def get_triple_from_march(march): "amdgcn": "amdgcn", "r600": "r600", "mips": "mips", + "nvptx64": "nvptx64", "sparc": "sparc", "hexagon": "hexagon", "ve": "ve", >From cdf1fa321a68fde6cec4d6fc384a1f78f984b521 Mon Sep 17 00:00:00 2001 From: "Denis.Gerasimov" <deng...@gmail.com> Date: Tue, 20 Aug 2024 13:27:38 +0300 Subject: [PATCH 8/8] Clang formater style --- clang/lib/Headers/__clang_cuda_device_functions.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h index f66fe625a39676..86123727a1bc3f 100644 --- a/clang/lib/Headers/__clang_cuda_device_functions.h +++ b/clang/lib/Headers/__clang_cuda_device_functions.h @@ -529,15 +529,17 @@ __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); } __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }; __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }; __DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); } -__DEVICE__ unsigned short __usAtomicCAS(unsigned short *__p, unsigned short __cmp, - unsigned short __v) { +__DEVICE__ unsigned short +__usAtomicCAS(unsigned short *__p, unsigned short __cmp, unsigned short __v) { return __nvvm_atom_cas_gen_us(__p, __cmp, __v); } -__DEVICE__ unsigned short __usAtomicCAS_block(unsigned short *__p, unsigned short __cmp, +__DEVICE__ unsigned short __usAtomicCAS_block(unsigned short *__p, + unsigned short __cmp, unsigned short __v) { return __nvvm_atom_cta_cas_gen_us(__p, __cmp, __v); } -__DEVICE__ unsigned short __usAtomicCAS_system(unsigned short *__p, unsigned short __cmp, +__DEVICE__ unsigned short __usAtomicCAS_system(unsigned short *__p, + unsigned short __cmp, unsigned short __v) { return __nvvm_atom_sys_cas_gen_us(__p, __cmp, __v); } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits