https://github.com/mbrkusanin updated https://github.com/llvm/llvm-project/pull/170501
From 8caa147de93cd534f4993d29cc702627b378f60b Mon Sep 17 00:00:00 2001 From: Mirko Brkusanin <[email protected]> Date: Wed, 3 Dec 2025 16:57:48 +0100 Subject: [PATCH 1/2] [AMDGPU] Add s_wakeup_barrier instruction for gfx1250 --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 15 +++++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 +++ .../AMDGPU/AMDGPUInstructionSelector.cpp | 14 +++++++ llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 1 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 29 +++++++++++-- llvm/lib/Target/AMDGPU/SOPInstructions.td | 14 +++++++ .../amdgpu-lower-exec-sync-and-module-lds.ll | 3 +- .../CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll | 1 - llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll | 41 +++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s | 12 ++++++ .../Disassembler/AMDGPU/gfx1250_dasm_sop1.txt | 9 ++++ 13 files changed, 141 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 8af6ce1528a45..2ec065716d21c 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -749,6 +749,7 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst") TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vv*", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e4a5fe9014c2e..b32bcdd408512 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -1489,6 +1489,21 @@ void test_s_cluster_barrier() __builtin_amdgcn_s_cluster_barrier(); } +// CHECK-LABEL: @test_s_wakeup_barrier( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_s_wakeup_barrier(void *bar) +{ + __builtin_amdgcn_s_wakeup_barrier(bar); +} + // CHECK-LABEL: @test_global_add_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index c2057ac3a14e6..cdad810f1458d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -314,6 +314,12 @@ def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">, Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +// void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier) +// The %barrier argument must be uniform, otherwise behavior is undefined. +def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">, + Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + IntrNoCallback, IntrNoFree]>; + // void @llvm.amdgcn.s.barrier.wait(i16 %barrierType) def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">, Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index c575714cf61cd..a06c0090a7b1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2392,6 +2392,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); + case Intrinsic::amdgcn_s_wakeup_barrier: { + if (!AMDGPU::isGFX1250(STI)) { + Function &F = I.getMF()->getFunction(); + F.getContext().diagnose( + DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget", + I.getDebugLoc(), DS_Error)); + return false; + } + return selectNamedBarrierInst(I, IntrinsicID); + } case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_get_named_barrier_state: return selectNamedBarrierInst(I, IntrinsicID); @@ -6830,6 +6840,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { llvm_unreachable("not a named barrier op"); case Intrinsic::amdgcn_s_barrier_join: return AMDGPU::S_BARRIER_JOIN_IMM; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_IMM; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_IMM; }; @@ -6839,6 +6851,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { llvm_unreachable("not a named barrier op"); case Intrinsic::amdgcn_s_barrier_join: return AMDGPU::S_BARRIER_JOIN_M0; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_M0; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_M0; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index 8145816405915..4e16c13e30e91 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -371,6 +371,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { case Intrinsic::amdgcn_s_barrier_wait: case Intrinsic::amdgcn_s_barrier_leave: case Intrinsic::amdgcn_s_get_barrier_state: + case Intrinsic::amdgcn_s_wakeup_barrier: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: case Intrinsic::amdgcn_sched_group_barrier: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index a88e4b2a2a31d..d7ad08d6bc0ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3345,6 +3345,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 1); return; case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: constrainOpWithReadfirstlane(B, MI, 1); return; case Intrinsic::amdgcn_s_barrier_init: @@ -5579,6 +5580,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; case Intrinsic::amdgcn_s_barrier_init: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 301f2fc8dab45..c2a0f1c5bfb8d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11503,6 +11503,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } + case Intrinsic::amdgcn_s_wakeup_barrier: { + if (!AMDGPU::isGFX1250(*Subtarget)) + return SDValue(); + [[fallthrough]]; + } case Intrinsic::amdgcn_s_barrier_join: { // these three intrinsics have one operand: barrier pointer SDValue Chain = Op->getOperand(0); @@ -11512,16 +11517,32 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, if (isa<ConstantSDNode>(BarOp)) { uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue(); - Opc = AMDGPU::S_BARRIER_JOIN_IMM; - + switch (IntrinsicID) { + default: + return SDValue(); + case Intrinsic::amdgcn_s_barrier_join: + Opc = AMDGPU::S_BARRIER_JOIN_IMM; + break; + case Intrinsic::amdgcn_s_wakeup_barrier: + Opc = AMDGPU::S_WAKEUP_BARRIER_IMM; + break; + } // extract the BarrierID from bits 4-9 of the immediate unsigned BarID = (BarVal >> 4) & 0x3F; SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); Ops.push_back(K); Ops.push_back(Chain); } else { - Opc = AMDGPU::S_BARRIER_JOIN_M0; - + switch (IntrinsicID) { + default: + return SDValue(); + case Intrinsic::amdgcn_s_barrier_join: + Opc = AMDGPU::S_BARRIER_JOIN_M0; + break; + case Intrinsic::amdgcn_s_wakeup_barrier: + Opc = AMDGPU::S_WAKEUP_BARRIER_M0; + break; + } // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0] SDValue M0Val; M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 1931e0be15152..8f92dfb957b1a 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -504,6 +504,12 @@ def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins), let isConvergent = 1; } +def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; + let SubtargetPredicate = isGFX1250Plus; +} } // End Uses = [M0] def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), @@ -527,6 +533,12 @@ def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs), let isConvergent = 1; } +def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; + let SubtargetPredicate = isGFX1250Plus; +} } // End has_sdst = 0 def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst), @@ -2226,6 +2238,8 @@ defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; // GFX1250 defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx12<0x06>; defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>; +defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12<0x057>; +defm S_WAKEUP_BARRIER_IMM : SOP1_IMM_Real_gfx12<0x057>; //===----------------------------------------------------------------------===// // SOP1 - GFX1150, GFX12 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll index bed8fa20a5044..215fb06106e11 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll @@ -112,8 +112,7 @@ attributes #2 = { nounwind readnone } ; CHECK: attributes #[[ATTR0]] = { nounwind } ; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-lds-size"="1" } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ;. ; CHECK: [[META0]] = !{i32 8396816, i32 8396817} ; CHECK: [[META1]] = !{i32 8396912, i32 8396913} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll index bde6db6463cb1..74e6d83ed2d94 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll @@ -96,7 +96,6 @@ attributes #2 = { nounwind readnone } ;. ; CHECK: attributes #[[ATTR0]] = { nounwind } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind } ;. ; CHECK: [[META0]] = !{i32 8396816, i32 8396817} ; CHECK: [[META1]] = !{i32 8396912, i32 8396913} diff --git a/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll new file mode 100644 index 0000000000000..b92d38cd857f2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-GISEL %s + +@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison + +define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +; GFX1250-SDAG-LABEL: kernel1: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1250-SDAG-NEXT: s_mov_b32 m0, 1 +; GFX1250-SDAG-NEXT: s_wakeup_barrier m0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_lshr_b32 s0, s0, 4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_b32 m0, s0, 63 +; GFX1250-SDAG-NEXT: s_wakeup_barrier m0 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: kernel1: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1250-GISEL-NEXT: s_wakeup_barrier 1 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_lshr_b32 s0, s0, 4 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_and_b32 m0, s0, 63 +; GFX1250-GISEL-NEXT: s_wakeup_barrier m0 +; GFX1250-GISEL-NEXT: s_endpgm + call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar) + call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %in) + ret void +} + + +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s index cc351afd49f04..68cfdc4c01178 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s @@ -70,3 +70,15 @@ s_get_barrier_state s3, -4 s_get_barrier_state s3, m0 // GFX1250: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe] + +s_wakeup_barrier 1 +// GFX1250: s_wakeup_barrier 1 ; encoding: [0x81,0x57,0x80,0xbe] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +s_wakeup_barrier -1 +// GFX1250: s_wakeup_barrier -1 ; encoding: [0xc1,0x57,0x80,0xbe] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +s_wakeup_barrier m0 +// GFX1250: s_wakeup_barrier m0 ; encoding: [0x7d,0x57,0x80,0xbe] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt index 34a46467c6839..1490914a5f61f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt @@ -33,3 +33,12 @@ 0x7d,0x50,0x83,0xbe # GFX1250: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe] + +0x81,0x57,0x80,0xbe +# GFX1250: s_wakeup_barrier 1 ; encoding: [0x81,0x57,0x80,0xbe] + +0xc1,0x57,0x80,0xbe +# GFX1250: s_wakeup_barrier -1 ; encoding: [0xc1,0x57,0x80,0xbe] + +0x7d,0x57,0x80,0xbe +# GFX1250: s_wakeup_barrier m0 ; encoding: [0x7d,0x57,0x80,0xbe] From 49b95048f247698736ad04b32eeec38eeea2e144 Mon Sep 17 00:00:00 2001 From: Mirko Brkusanin <[email protected]> Date: Wed, 3 Dec 2025 17:37:41 +0100 Subject: [PATCH 2/2] Added subtarget feature: WakeupBarrier --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +- clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl | 4 ++-- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPU.td | 10 ++++++++++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 ++++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 ++-- llvm/lib/TargetParser/TargetParser.cpp | 1 + 9 files changed, 24 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 2ec065716d21c..e3ff653d7aa22 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -749,7 +749,7 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst") TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts") -TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vv*", "n", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vv*", "n", "wakeup-barrier-inst") TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl index c5656c49c4761..f6312be342ca9 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl @@ -26,8 +26,8 @@ kernel void foo(global int *p) { *p = 1; } // CHECK-NEXT: ret void // //. -// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" } -// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" } +// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wakeup-barrier-inst,+wavefrontsize32" "uniform-work-group-size"="false" } +// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wakeup-barrier-inst,+wavefrontsize32" } // CHECK: attributes #[[ATTR2]] = { convergent nounwind } //. // CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index bd162b40b8e47..96f4cdfc8d1ca 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -109,8 +109,8 @@ // GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" // GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" // GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32" -// GFX1250: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" -// GFX1251: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" +// GFX1250: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wakeup-barrier-inst,+wavefrontsize32" +// GFX1251: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wakeup-barrier-inst,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize64" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ed8ae2b16c5d4..afce9f17a3173 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1276,6 +1276,12 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst", "Has s_setprio_inc_wg instruction." >; +def FeatureWakeupBarrier : SubtargetFeature<"wakeup-barrier-inst", + "HasWakeupBarrier", + "true", + "Has s_wakeup_barrier instruction." +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -2200,6 +2206,7 @@ def FeatureISAVersion12_50_Common : FeatureSet< FeaturePkAddMinMaxInsts, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, + FeatureWakeupBarrier, Feature45BitNumRecordsBufferResource, FeatureSupportsXNACK, FeatureXNACK, @@ -3065,6 +3072,9 @@ def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic( def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>; +def HasWakeupBarrier : Predicate<"Subtarget->hasWakeupBarrier()">, + AssemblerPredicate<(all_of FeatureWakeupBarrier)>; + def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a06c0090a7b1f..62f854fd822d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2393,7 +2393,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); case Intrinsic::amdgcn_s_wakeup_barrier: { - if (!AMDGPU::isGFX1250(STI)) { + if (!STI.hasWakeupBarrier()) { Function &F = I.getMF()->getFunction(); F.getContext().diagnose( DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 34eb8b2266311..27ac9e5d14462 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -289,6 +289,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; + bool HasWakeupBarrier = false; bool RequiresCOV6 = false; bool UseBlockVGPROpsForCSR = false; @@ -1612,6 +1613,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // \returns true if target has S_SETPRIO_INC_WG instruction. bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; } + // \returns true if target has S_WAKEUP_BARRIER instruction. + bool hasWakeupBarrier() const { return HasWakeupBarrier; } + // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead // of sign-extending. Note that GFX1250 has not only fixed the bug but also // extended VA to 57 bits. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c2a0f1c5bfb8d..c10ff0eb33daa 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11504,7 +11504,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(NewMI, 0); } case Intrinsic::amdgcn_s_wakeup_barrier: { - if (!AMDGPU::isGFX1250(*Subtarget)) + if (!Subtarget->hasWakeupBarrier()) return SDValue(); [[fallthrough]]; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 8f92dfb957b1a..a0bd298313b36 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -508,7 +508,7 @@ def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins), "", []>{ let SchedRW = [WriteBarrier]; let isConvergent = 1; - let SubtargetPredicate = isGFX1250Plus; + let SubtargetPredicate = HasWakeupBarrier; } } // End Uses = [M0] @@ -537,7 +537,7 @@ def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs), (ins SplitBarrier:$src0), "$src0", []>{ let SchedRW = [WriteBarrier]; let isConvergent = 1; - let SubtargetPredicate = isGFX1250Plus; + let SubtargetPredicate = HasWakeupBarrier; } } // End has_sdst = 0 diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 28f3649a840d6..75fa121839a7a 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -443,6 +443,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["atomic-global-pk-add-bf16-inst"] = true; Features["atomic-ds-pk-add-16-insts"] = true; Features["setprio-inc-wg-inst"] = true; + Features["wakeup-barrier-inst"] = true; Features["atomic-fmin-fmax-global-f32"] = true; Features["atomic-fmin-fmax-global-f64"] = true; Features["wavefrontsize32"] = true; _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
