llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Mariusz Sikora (mariusz-sikora-at-amd) <details> <summary>Changes</summary> --- Patch is 112.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74836.diff 27 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+16) - (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl (+24) - (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl (+174) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+39) - (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+152) - (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+3) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+44-1) - (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+11-1) - (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+11) - (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h (+1) - (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3) - (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+1) - (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (+1) - (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+9) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+111-1) - (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+5) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+8-1) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+7) - (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+112) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+1) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp (+10) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll (+77) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll (+1366) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_sop1.s (+45) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_sopp.s (+9) - (modified) 
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt (+53) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt (+9) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 8b59b3790d7bc6..7465f13d552d6e 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -406,5 +406,21 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts") +//===----------------------------------------------------------------------===// +// GFX12+ only builtins. +//===----------------------------------------------------------------------===// + +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst_var, "bi", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vii", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") + + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl new file mode 100644 index 00000000000000..5e0153c42825e3 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl @@ -0,0 +1,24 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: 
%clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s + +kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) { + + __builtin_amdgcn_s_barrier_signal(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal' must be a constant integer}} + __builtin_amdgcn_s_barrier_wait(-1); + *out = *in; +} + +kernel void builtins_amdgcn_s_barrier_wait_err(global int* in, global int* out, int barrier) { + + __builtin_amdgcn_s_barrier_signal(-1); + __builtin_amdgcn_s_barrier_wait(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_wait' must be a constant integer}} + *out = *in; +} + +kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global int* out, int barrier) { + + __builtin_amdgcn_s_barrier_signal_isfirst(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' must be a constant integer}} + __builtin_amdgcn_s_barrier_wait(-1); + *out = *in; +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl new file mode 100644 index 00000000000000..b8d281531e218e --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -0,0 +1,174 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s + +// CHECK-LABEL: @test_s_barrier_signal( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +// CHECK-NEXT: ret void +// +void test_s_barrier_signal() +{ + __builtin_amdgcn_s_barrier_signal(-1); + __builtin_amdgcn_s_barrier_wait(-1); +} + +// CHECK-LABEL: @test_s_barrier_signal_var( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 +// 
CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +void test_s_barrier_signal_var(int a) +{ + __builtin_amdgcn_s_barrier_signal_var(a); +} + +// CHECK-LABEL: @test_s_barrier_signal_isfirst( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8 +// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1) +// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +// CHECK: if.then: +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: br label [[IF_END:%.*]] +// CHECK: if.else: +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: br label [[IF_END]] +// CHECK: if.end: +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +// CHECK-NEXT: ret void +// +void test_s_barrier_signal_isfirst(int* a, int* b, int *c) +{ + if(__builtin_amdgcn_s_barrier_signal_isfirst(1)) + a = b; + else + a = c; + + __builtin_amdgcn_s_barrier_wait(1); +} + +// CHECK-LABEL: @test_s_barrier_isfirst_var( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr 
[[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8 +// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8 +// CHECK-NEXT: store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]]) +// CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +// CHECK: if.then: +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: br label [[IF_END:%.*]] +// CHECK: if.else: +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: br label [[IF_END]] +// CHECK: if.end: +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +// CHECK-NEXT: ret void +// +void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d) +{ + if ( __builtin_amdgcn_s_barrier_signal_isfirst_var(d)) + a = b; + else + a = c; + + __builtin_amdgcn_s_barrier_wait(1); + +} + +// CHECK-LABEL: @test_s_barrier_init( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +void test_s_barrier_init(int a) +{ + __builtin_amdgcn_s_barrier_init(1, a); +} + +// CHECK-LABEL: @test_s_barrier_join( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1) +// CHECK-NEXT: ret void +// +void test_s_barrier_join() +{ + __builtin_amdgcn_s_barrier_join(1); +} + +// CHECK-LABEL: @test_s_wakeup_barrier( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1) +// CHECK-NEXT: ret void +// +void test_s_wakeup_barrier() +{ + __builtin_amdgcn_s_barrier_join(1); +} + +// CHECK-LABEL: @test_s_barrier_leave( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8 +// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave() +// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +// CHECK: if.then: +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: br label [[IF_END:%.*]] +// CHECK: if.else: +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: br label [[IF_END]] +// CHECK: if.end: +// CHECK-NEXT: ret void +// +void test_s_barrier_leave(int* a, int* b, int *c) +{ + if (__builtin_amdgcn_s_barrier_leave()) + a = b; + else + a = c; +} + +// CHECK-LABEL: @test_s_get_barrier_state( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[STATE:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]]) +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4 +// CHECK-NEXT: ret 
i32 [[TMP2]] +// +unsigned test_s_get_barrier_state(int a) +{ + unsigned State = __builtin_amdgcn_s_get_barrier_state(a); + return State; +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index bc9f99783d98f2..09e88152e65d2a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -227,6 +227,45 @@ def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">, + Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + IntrNoCallback, IntrNoFree]>; + +def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">, + Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + IntrNoCallback, IntrNoFree]>; + +def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">, + Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">, + Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + IntrNoCallback, IntrNoFree]>; + +def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">, + Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">, + Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, 
IntrConvergent, IntrWillReturn, + IntrNoCallback, IntrNoFree]>; + +def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">, + Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + IntrNoCallback, IntrNoFree]>; + +def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">, + Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">, + Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, + IntrNoCallback, IntrNoFree]>; + def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index d24c7da964ce85..75fac09d0b99fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1791,6 +1791,19 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { return true; } } + + // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait + if (STI.hasSplitBarriers()) { + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) + .addImm(AMDGPU::Barrier::WORKGROUP); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT)) + .addImm(AMDGPU::Barrier::WORKGROUP); + MI.eraseFromParent(); + return true; + } + return selectImpl(MI, 
*CoverageInfo); } @@ -2137,6 +2150,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( break; case Intrinsic::amdgcn_ds_bvh_stack_rtn: return selectDSBvhStackIntrinsic(I); + case Intrinsic::amdgcn_s_barrier_init: + case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: + case Intrinsic::amdgcn_s_get_barrier_state: + return selectNamedBarrierInst(I, IntrinsicID); + case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: + return selectSBarrierSignalIsfirst(I, IntrinsicID); + case Intrinsic::amdgcn_s_barrier_leave: + return selectSBarrierLeave(I); } return selectImpl(I, *CoverageInfo); } @@ -5239,6 +5262,135 @@ AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const { }}; } +bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst( + MachineInstr &I, Intrinsic::ID IntrID) const { + MachineBasicBlock *MBB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + Register CCReg = I.getOperand(0).getReg(); + + bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var; + + if (HasM0) { + auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .addReg(I.getOperand(2).getReg()); + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0)); + if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI)) + return false; + } else { + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM)) + .addImm(I.getOperand(2).getImm()); + } + + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC); + + I.eraseFromParent(); + return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass, + *MRI); +} + +unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { + if (HasInlineConst) { + switch (IntrID) { + default: + llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_init: + return AMDGPU::S_BARRIER_INIT_IMM; + case 
Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_IMM; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_IMM; + case Intrinsic::amdgcn_s_get_barrier_state: + return AMDGPU::S_GET_BARRIER_STATE_IMM; + }; + } else { + switch (IntrID) { + default: + llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_init: + return AMDGPU::S_BARRIER_INIT_M0; + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_M0; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_M0; + case Intrinsic::amdgcn_s_get_barrier_state: + return AMDGPU::S_GET_BARRIER_STATE_M0; + }; + } +} + +bool AMDGPUInstructionSelector::selectNamedBarrierInst( + MachineInstr &I, Intrinsic::ID IntrID) const { + MachineBasicBlock *MBB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state + ? I.getOperand(2) + : I.getOperand(1); + std::optional<int64_t> BarValImm = + getIConstantVRegSExtVal(BarOp.getReg(), *MRI); + Register M0Val; + Register TmpReg0; + + // For S_BARRIER_INIT, member count will always be read from M0[16:22] + if (IntrID == Intrinsic::amdgcn_s_barrier_init) { + Register MemberCount = I.getOperand(2).getReg(); + TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + // TODO: This should be expanded during legalization so that the S_LSHL + // and S_OR can be constant-folded + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) + .addImm(16) + .addReg(MemberCount); + M0Val = TmpReg0; + } + + // If not inlinable, get reference to barrier depending on the instruction + if (!BarValImm) { + if (IntrID == Intrinsic::amdgcn_s_barrier_init) { + // If reference to barrier id is not an inlinable constant then it must be + // referenced with M0[4:0]. Perform an OR with the member count to include + // it in M0 for S_BARRIER_INIT. 
+ Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1) + .addReg(BarOp.getReg()) + .addReg(TmpReg0); + M0Val = TmpReg1; + } else { + M0Val = BarOp.getReg(); + } + } + + // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required. + if (M0Val) { + auto CopyMIB = + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val); + constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); + } + + MachineInstrBuilder MIB; + unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID); + MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); + + if (IntrID == Intrinsic::amdgcn_s_get_barrier_state) + MIB.addDef(I.getOperand(0).getReg()); + + if (BarValImm) + MIB.addImm(*BarValImm); + + I.eraseFromParent(); + return true; +} +bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + Register CCReg = I.getOperand(0).getReg(); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE)); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC); + + I.eraseFromParent(); + return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass, + *MRI); +} + void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/74836 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits