================ @@ -0,0 +1,258 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_90 -march=nvptx64 -mattr=+ptx80 | FileCheck %s +; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +@llvm.used = appending global [1 x ptr] [ptr @test_distributed_shared_cluster], section "llvm.metadata" + +declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32) +declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr) +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +declare ptr @llvm.nvvm.mapa(ptr, i32) + +define i32 @test_distributed_shared_cluster(ptr %ptr, ptr addrspace(3) %smem_ptr) local_unnamed_addr { +; CHECK-LABEL: test_distributed_shared_cluster( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<13>; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<69>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-NEXT: .reg .b64 %rd<24>; +; CHECK-NEXT: .reg .f64 %fd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_param_0]; +; CHECK-NEXT: ld.param.u64 %rd3, [test_distributed_shared_cluster_param_1]; +; CHECK-NEXT: mov.u32 %r24, %ctaid.x; +; CHECK-NEXT: xor.b32 %r25, %r24, 1; +; CHECK-NEXT: isspacep.shared::cluster %p1, %rd2; +; CHECK-NEXT: mapa.u64 %rd4, %rd2, %r25; +; CHECK-NEXT: isspacep.shared::cluster %p2, %rd4; +; CHECK-NEXT: mapa.shared::cluster.u64 %rd5, %rd3, %r25; +; CHECK-NEXT: mov.b16 %rs1, 0x3C00; +; CHECK-NEXT: atom.shared::cluster.add.noftz.f16 %rs2, [%rd5], %rs1; +; CHECK-NEXT: mov.b16 %rs3, 0x3F80; +; CHECK-NEXT: atom.shared::cluster.add.noftz.bf16 %rs4, [%rd5], %rs3; +; CHECK-NEXT: atom.shared::cluster.add.f32 %f1, [%rd5], 0f3F800000; +; CHECK-NEXT: atom.shared::cluster.add.f64 %fd1, [%rd5], 0d3FF0000000000000; +; CHECK-NEXT: atom.shared::cluster.add.u32 %r26, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.add.u64 %rd6, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.exch.b32 %r27, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.exch.b64 %rd7, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.min.s32 %r28, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.min.s64 %rd8, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.min.u32 %r29, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.min.u64 %rd9, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.max.s32 %r30, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.max.s64 %rd10, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.max.u32 %r31, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.max.u64 %rd11, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.inc.u32 %r32, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.dec.u32 %r33, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.and.b32 %r34, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.and.b64 %rd12, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.or.b32 %r35, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.or.b64 %rd13, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.xor.b32 %r36, [%rd5], 1; +; CHECK-NEXT: atom.shared::cluster.xor.b64 %rd14, [%rd5], 1; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r37, [%rd5], 1, 0; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r38, [%rd5], 1, 0; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r39, [%rd5], 1, 0; +; CHECK-NEXT: atom.release.shared::cluster.cas.b32 %r40, [%rd5], 1, 0; +; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r41, [%rd5], 1, 0; +; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r42, [%rd5], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r43, [%rd5], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r44, [%rd5], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r45, [%rd5], 1, 0; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b64 %rd15, [%rd5], 1, 0; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd16, [%rd5], 1, 0; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd17, [%rd5], 1, 0; +; CHECK-NEXT: atom.release.shared::cluster.cas.b64 %rd18, [%rd5], 1, 0; +; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd19, [%rd5], 1, 0; +; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd20, [%rd5], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd21, [%rd5], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd22, [%rd5], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd23, [%rd5], 1, 0; +; CHECK-NEXT: and.b64 %rd1, %rd5, -4; +; CHECK-NEXT: cvt.u32.u64 %r46, %rd5; +; CHECK-NEXT: and.b32 %r47, %r46, 3; +; CHECK-NEXT: shl.b32 %r1, %r47, 3; +; CHECK-NEXT: mov.b32 %r48, 65535; +; CHECK-NEXT: shl.b32 %r49, %r48, %r1; +; CHECK-NEXT: not.b32 %r2, %r49; +; CHECK-NEXT: mov.b32 %r50, 1; +; CHECK-NEXT: shl.b32 %r3, %r50, %r1; +; CHECK-NEXT: ld.shared::cluster.u32 %r51, [%rd1]; +; CHECK-NEXT: and.b32 %r64, %r51, %r2; +; CHECK-NEXT: $L__BB0_1: // %partword.cmpxchg.loop33 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r52, %r64, %r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r52, %r64; +; CHECK-NEXT: setp.eq.s32 %p3, %r6, %r52; +; CHECK-NEXT: @%p3 bra $L__BB0_3; +; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32 +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: and.b32 %r7, %r6, %r2; +; CHECK-NEXT: setp.ne.s32 %p4, %r64, %r7; +; CHECK-NEXT: mov.b32 %r64, %r7; +; CHECK-NEXT: @%p4 bra $L__BB0_1; +; CHECK-NEXT: $L__BB0_3: // %partword.cmpxchg.end31 +; CHECK-NEXT: ld.shared::cluster.u32 %r53, [%rd1]; +; CHECK-NEXT: and.b32 %r65, %r53, %r2; +; CHECK-NEXT: $L__BB0_4: // %partword.cmpxchg.loop23 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r54, %r65, %r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r54, %r65; +; CHECK-NEXT: setp.eq.s32 %p5, %r10, %r54; +; CHECK-NEXT: @%p5 bra $L__BB0_6; +; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22 +; CHECK-NEXT: // in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: and.b32 %r11, %r10, %r2; +; CHECK-NEXT: setp.ne.s32 %p6, %r65, %r11; +; CHECK-NEXT: mov.b32 %r65, %r11; +; CHECK-NEXT: @%p6 bra $L__BB0_4; +; CHECK-NEXT: $L__BB0_6: // %partword.cmpxchg.end21 +; CHECK-NEXT: fence.acq_rel.sys; +; CHECK-NEXT: fence.acq_rel.sys; +; CHECK-NEXT: ld.shared::cluster.u32 %r55, [%rd1]; +; CHECK-NEXT: and.b32 %r66, %r55, %r2; +; CHECK-NEXT: $L__BB0_7: // %partword.cmpxchg.loop13 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r56, %r66, %r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r56, %r66; +; CHECK-NEXT: setp.eq.s32 %p7, %r14, %r56; +; CHECK-NEXT: @%p7 bra $L__BB0_9; +; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12 +; CHECK-NEXT: // in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: and.b32 %r15, %r14, %r2; +; CHECK-NEXT: setp.ne.s32 %p8, %r66, %r15; +; CHECK-NEXT: mov.b32 %r66, %r15; +; CHECK-NEXT: @%p8 bra $L__BB0_7; +; CHECK-NEXT: $L__BB0_9: // %partword.cmpxchg.end11 +; CHECK-NEXT: fence.acq_rel.sys; +; CHECK-NEXT: ld.shared::cluster.u32 %r57, [%rd1]; +; CHECK-NEXT: and.b32 %r67, %r57, %r2; +; CHECK-NEXT: $L__BB0_10: // %partword.cmpxchg.loop3 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r58, %r67, %r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r58, %r67; +; CHECK-NEXT: setp.eq.s32 %p9, %r18, %r58; +; CHECK-NEXT: @%p9 bra $L__BB0_12; +; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2 +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=1 +; CHECK-NEXT: and.b32 %r19, %r18, %r2; +; CHECK-NEXT: setp.ne.s32 %p10, %r67, %r19; +; CHECK-NEXT: mov.b32 %r67, %r19; +; CHECK-NEXT: @%p10 bra $L__BB0_10; +; CHECK-NEXT: $L__BB0_12: // %partword.cmpxchg.end1 +; CHECK-NEXT: fence.acq_rel.sys; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.shared::cluster.u32 %r59, [%rd1]; +; CHECK-NEXT: and.b32 %r68, %r59, %r2; +; CHECK-NEXT: $L__BB0_13: // %partword.cmpxchg.loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r60, %r68, %r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r60, %r68; +; CHECK-NEXT: setp.eq.s32 %p11, %r22, %r60; +; CHECK-NEXT: @%p11 bra $L__BB0_15; +; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure +; CHECK-NEXT: // in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: and.b32 %r23, %r22, %r2; +; CHECK-NEXT: setp.ne.s32 %p12, %r68, %r23; +; CHECK-NEXT: mov.b32 %r68, %r23; +; CHECK-NEXT: @%p12 bra $L__BB0_13; +; CHECK-NEXT: $L__BB0_15: // %partword.cmpxchg.end +; CHECK-NEXT: fence.acq_rel.sys; +; CHECK-NEXT: selp.b32 %r61, 1, 0, %p1; +; CHECK-NEXT: selp.b32 %r62, 1, 0, %p2; +; CHECK-NEXT: add.s32 %r63, %r61, %r62; +; CHECK-NEXT: st.param.b32 [func_retval0], %r63; +; CHECK-NEXT: ret; +entry: ---------------- modiking wrote:
Good call, added https://github.com/llvm/llvm-project/pull/135444 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits