Author: Changpeng Fang
Date: 2025-07-01T11:08:49-07:00
New Revision: 5035d20dcbcea1edced779148ac69b84d3c97577
URL: https://github.com/llvm/llvm-project/commit/5035d20dcbcea1edced779148ac69b84d3c97577
DIFF: https://github.com/llvm/llvm-project/commit/5035d20dcbcea1edced779148ac69b84d3c97577.diff

LOG: AMDGPU: Implement ds_atomic_async_barrier_arrive_b64/ds_atomic_barrier_arrive_rtn_b64 (#146409)

These two instructions are supported by gfx1250. We define the
instructions and implement the corresponding intrinsic and builtin.

Co-authored-by: Stanislav Mekhanoshin <stanislav.mekhanos...@amd.com>

Added: 
    clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.atomic.async.barrier.arrive.b64.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64.ll
    llvm/test/MC/AMDGPU/gfx1250_asm_ds.s
    llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_ds.txt

Modified: 
    clang/include/clang/Basic/BuiltinsAMDGPU.def
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/DSInstructions.td
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/SIDefines.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInstrFormats.td
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Removed: 
    

################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0104482e9117f..5133947c498ca 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -653,6 +653,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1
 TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts")
+
 TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
 TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl
new file mode 100644
index 0000000000000..e3fe31ff7dd75
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl
@@ -0,0 +1,24 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_async_barrier_arrive_b64(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) [[ADDR:%.*]])
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_ds_atomic_async_barrier_arrive_b64(local long* addr)
+{
+  __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64(addr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_barrier_arrive_rtn_b64(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64(ptr addrspace(3) [[ADDR:%.*]], i64 [[DATA:%.*]])
+// CHECK-GFX1250-NEXT: store i64 [[TMP0]], ptr [[OUT:%.*]], align 8, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_ds_atomic_barrier_arrive_rtn_b64(local long* addr, long data, long *out)
+{
+  *out = __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64(addr, data);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c09c3630719f4..a0a81568424f5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3504,6 +3504,19 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
 // gfx1250 intrinsics
 // ===----------------------------------------------------------------------===//
+def int_amdgcn_ds_atomic_async_barrier_arrive_b64 :
+  ClangBuiltin<"__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64">,
+  Intrinsic<[], [local_ptr_ty],
+            // Atomically updates LDS and also ASYNC_CNT which is modeled as InaccessibleMem.
+            [IntrConvergent, IntrWillReturn, IntrInaccessibleMemOrArgMemOnly],
+            "", [SDNPMemOperand]>;
+
+def int_amdgcn_ds_atomic_barrier_arrive_rtn_b64 :
+  ClangBuiltin<"__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64">,
+  Intrinsic<[llvm_i64_ty], [local_ptr_ty, llvm_i64_ty],
+            [IntrConvergent, IntrWillReturn, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
+            "", [SDNPMemOperand]>;
+
 def int_amdgcn_s_monitor_sleep :
   ClangBuiltin<"__builtin_amdgcn_s_monitor_sleep">,
   DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 72d6a78539ada..1a1c32fba9d18 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1319,6 +1319,12 @@ def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
   "The platform has memory to lds instructions (global_load w/lds bit set, buffer_load w/lds bit set or global_load_lds. This does not include scratch_load_lds."
 >;
+def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic",
+  "HasLdsBarrierArriveAtomic",
+  "true",
+  "Has LDS barrier-arrive atomic instructions"
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
@@ -1955,6 +1961,7 @@ def FeatureISAVersion12_50 : FeatureSet<
   FeatureMemoryAtomicFAddF32DenormalSupport,
   FeatureKernargPreload,
   FeatureLshlAddU64Inst,
+  FeatureLdsBarrierArriveAtomic,
   FeatureSetPrioIncWgInst,
  ]>;
@@ -2687,6 +2694,9 @@ def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
 def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
   AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
+def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
+  AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
+
 def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">,
   AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 778d257c88a38..6f6d7b8d99af5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5117,6 +5117,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_ds_read_tr6_b96:
     case Intrinsic::amdgcn_ds_read_tr8_b64:
     case Intrinsic::amdgcn_ds_read_tr16_b64:
+    case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
+    case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
       return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_ds_ordered_add:
     case Intrinsic::amdgcn_ds_ordered_swap: {
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 44b03226738eb..e219fe05f881b 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -354,6 +354,20 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
   let gdsValue = 1;
 }
+class DS_1A_Off16_NORET <string opName>
+: DS_Pseudo<opName,
+  (outs),
+  (ins VGPR_32:$addr, Offset:$offset, gds:$gds),
+  " $addr$offset$gds"> {
+
+  let has_vdst = 0;
+  let has_offset = 1;
+  let has_data0 = 0;
+  let has_data1 = 0;
+  let has_m0_read = 0;
+  let IsAtomicNoRet = 1;
+}
+
 class DS_0A_RET <string opName> : DS_Pseudo<opName,
   (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
   (ins Offset:$offset, gds:$gds),
@@ -794,6 +808,24 @@ defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>;
 defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
 } // End WaveSizePredicate = isWave32, mayStore = 0
+let OtherPredicates = [HasLdsBarrierArriveAtomic] in {
+let ASYNC_CNT = 1, LGKM_CNT = 0, Uses = [EXEC, ASYNCcnt], Defs = [ASYNCcnt] in {
+def DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_1A_Off16_NORET<"ds_atomic_async_barrier_arrive_b64">;
+}
+
+def : GCNPat <
+  (int_amdgcn_ds_atomic_async_barrier_arrive_b64 (DS1Addr1Offset i32:$ptr, i32:$offset)),
+  (DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPR_32:$ptr, Offset:$offset, (i1 0))
+>;
+
+defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VReg_64>;
+
+def : GCNPat<
+  (i64 (int_amdgcn_ds_atomic_barrier_arrive_rtn_b64 (DS1Addr1Offset i32:$ptr, i32:$offset), i64:$data)),
+  (DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 $ptr, $data, Offset:$offset, (i1 0))
+>;
+} // End OtherPredicates = [HasLdsBarrierArriveAtomic]
+
 } // End SubtargetPredicate = isGFX1250Plus
 
 let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
@@ -1366,6 +1398,11 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
 defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
 defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;
+let AssemblerPredicate = HasLdsBarrierArriveAtomic in {
+defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>;
+defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>;
+}
+
 // New aliases added in GFX12 without renaming the instructions.
 let AssemblerPredicate = isGFX12Plus in {
 def : AMDGPUMnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 2f79599091faf..45721422edcf1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -263,6 +263,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasMinimum3Maximum3PKF16 = false;
   bool HasLshlAddU64Inst = false;
   bool HasPointSampleAccel = false;
+  bool HasLdsBarrierArriveAtomic = false;
   bool HasSetPrioIncWgInst = false;
   bool RequiresCOV6 = false;
@@ -1381,6 +1382,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool hasPointSampleAccel() const { return HasPointSampleAccel; }
+  bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }
+
   /// \returns The maximum number of instructions that can be enclosed in an
   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
   /// instruction.
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index baf74dbdde20e..4b72f66abbd76 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -113,8 +113,7 @@ enum : uint64_t {
   SCALAR_STORE = UINT64_C(1) << 39,
   FIXED_SIZE = UINT64_C(1) << 40,
-  // Reserved, must be 0.
-  Reserved1 = UINT64_C(1) << 41,
+  ASYNC_CNT = UINT64_C(1) << 41,
   VOP3_OPSEL = UINT64_C(1) << 42,
   maybeAtomic = UINT64_C(1) << 43,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bb1de58e04fbc..b083a9014737b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1404,6 +1404,19 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     return true;
   }
+  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
+  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
+    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
+                   ? ISD::INTRINSIC_W_CHAIN
+                   : ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.memVT = MVT::i64;
+    Info.size = 8;
+    Info.align.reset();
+    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    return true;
+  }
   case Intrinsic::amdgcn_global_atomic_csub: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(CI.getType());
@@ -1564,6 +1577,8 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
   case Intrinsic::amdgcn_ds_read_tr16_b64:
   case Intrinsic::amdgcn_ds_ordered_add:
   case Intrinsic::amdgcn_ds_ordered_swap:
+  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
+  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
   case Intrinsic::amdgcn_flat_atomic_fmin_num:
   case Intrinsic::amdgcn_global_atomic_csub:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 267c9a94b9096..42aae35112cac 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -77,6 +77,9 @@ class InstSI <dag outs, dag ins, string asm = "",
   // instruction size.
   field bit FixedSize = 0;
+  // wait count to manage asynchronous loads/stores.
+  field bit ASYNC_CNT = 0;
+
   // This bit indicates that this is a VOP3 opcode which supports op_sel
   // modifier.
   field bit VOP3_OPSEL = 0;
@@ -204,8 +207,7 @@ class InstSI <dag outs, dag ins, string asm = "",
   let TSFlags{39} = ScalarStore;
   let TSFlags{40} = FixedSize;
-  // Reserved, must be 0.
-  let TSFlags{41} = 0;
+  let TSFlags{41} = ASYNC_CNT;
   let TSFlags{42} = VOP3_OPSEL;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5962556db62eb..266b6aae69c5a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -387,7 +387,10 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
       if (DataOpIdx == -1)
         DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
-      Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
+      if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
+        Width = LocationSize::precise(64);
+      else
+        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
     } else {
       // The 2 offset instructions use offset0 and offset1 instead. We can treat
       // these as a load with a single offset if the 2 offsets are consecutive.
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8c3873d23419f..30b75e6fd78cd 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -656,6 +656,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+  // Reserve async counters pseudo registers
+  reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
+
   // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
   reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index d595163f820cb..47d3f548a20c0 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -234,6 +234,9 @@ defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>;
 // Not addressable
 def MODE : SIReg <"mode", 0>;
+// Not addressable, used to model dependencies.
+def ASYNCcnt : SIReg <"ASYNCcnt", 0>;
+
 def LDS_DIRECT : SIReg <"src_lds_direct", 254> {
   // There is no physical register corresponding to this. This is an
   // encoding value in a source field, which will ultimately trigger a
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.atomic.async.barrier.arrive.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.atomic.async.barrier.arrive.b64.ll
new file mode 100644
index 0000000000000..9e0bd5e24e7bc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.atomic.async.barrier.arrive.b64.ll
@@ -0,0 +1,21 @@
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3))
+
+; GCN-LABEL: {{^}}test_ds_atomic_async_barrier_arrive_b64:
+; GCN: ds_atomic_async_barrier_arrive_b64 v0{{$}}
+define void @test_ds_atomic_async_barrier_arrive_b64(ptr addrspace(3) %bar) {
+entry:
+  call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) %bar)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_ds_atomic_async_barrier_arrive_b64_off:
+; GCN: ds_atomic_async_barrier_arrive_b64 v0 offset:8184{{$}}
+define void @test_ds_atomic_async_barrier_arrive_b64_off(ptr addrspace(3) %in) {
+entry:
+  %bar = getelementptr i64, ptr addrspace(3) %in, i32 1023
+  call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) %bar)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64.ll
new file mode 100644
index 0000000000000..8c7c760aa5010
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64.ll
@@ -0,0 +1,27 @@
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare i64 @llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64(ptr addrspace(3), i64)
+
+; GCN-LABEL: {{^}}test_ds_atomic_barrier_arrive_rtn_b64:
+; GCN: ds_atomic_barrier_arrive_rtn_b64 v[{{[0-9:]+}}], v2, v[0:1]{{$}}
+; GCN: s_wait_dscnt 0x0
+; GCN: flat_store_b64
+define void @test_ds_atomic_barrier_arrive_rtn_b64(i64 %data, ptr addrspace(3) %bar, ptr %out) {
+entry:
+  %ret = call i64 @llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64(ptr addrspace(3) %bar, i64 %data)
+  store i64 %ret, ptr %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_ds_atomic_barrier_arrive_rtn_b64_off:
+; GCN: ds_atomic_barrier_arrive_rtn_b64 v[{{[0-9:]+}}], v0, v[{{[0-9:]+}}] offset:8184{{$}}
+; GCN: s_wait_dscnt 0x0
+; GCN: flat_store_b64
+define void @test_ds_atomic_barrier_arrive_rtn_b64_off(ptr addrspace(3) %in, ptr %out) {
+entry:
+  %bar = getelementptr i64, ptr addrspace(3) %in, i32 1023
+  %ret = call i64 @llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64(ptr addrspace(3) %bar, i64 512)
+  store i64 %ret, ptr %out
+  ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_ds.s b/llvm/test/MC/AMDGPU/gfx1250_asm_ds.s
new file mode 100644
index 0000000000000..98436c9d6aa9c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_ds.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR %s
+
+ds_atomic_async_barrier_arrive_b64 v1 offset:65407
+// GFX1250: ds_atomic_async_barrier_arrive_b64 v1 offset:65407 ; encoding: [0x7f,0xff,0x58,0xd9,0x01,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_atomic_async_barrier_arrive_b64 v255 offset:1040
+// GFX1250: ds_atomic_async_barrier_arrive_b64 v255 offset:1040 ; encoding: [0x10,0x04,0x58,0xd9,0xff,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_atomic_async_barrier_arrive_b64 v5
+// GFX1250: ds_atomic_async_barrier_arrive_b64 v5 ; encoding: [0x00,0x00,0x58,0xd9,0x05,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_atomic_barrier_arrive_rtn_b64 v[2:3], v2, v[4:5]
+// GFX1250: ds_atomic_barrier_arrive_rtn_b64 v[2:3], v2, v[4:5] ; encoding: [0x00,0x00,0xd4,0xd9,0x02,0x04,0x00,0x02]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_atomic_barrier_arrive_rtn_b64 v[2:3], v2, v[4:5] offset:513
+// GFX1250: ds_atomic_barrier_arrive_rtn_b64 v[2:3], v2, v[4:5] offset:513 ; encoding: [0x01,0x02,0xd4,0xd9,0x02,0x04,0x00,0x02]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_atomic_barrier_arrive_rtn_b64 v[254:255], v2, v[4:5] offset:65535
+// GFX1250: ds_atomic_barrier_arrive_rtn_b64 v[254:255], v2, v[4:5] offset:65535 ; encoding: [0xff,0xff,0xd4,0xd9,0x02,0x04,0x00,0xfe]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_ds.txt
new file mode 100644
index 0000000000000..e03c4327d9814
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_ds.txt
@@ -0,0 +1,19 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX1250 %s
+
+# GFX1250: ds_atomic_async_barrier_arrive_b64 v1 offset:65407 ; encoding: [0x7f,0xff,0x58,0xd9,0x01,0x00,0x00,0x00]
+0x7f,0xff,0x58,0xd9,0x01,0x00,0x00,0x00
+
+# GFX1250: ds_atomic_async_barrier_arrive_b64 v255 offset:1040 ; encoding: [0x10,0x04,0x58,0xd9,0xff,0x00,0x00,0x00]
+0x10,0x04,0x58,0xd9,0xff,0x00,0x00,0x00
+
+# GFX1250: ds_atomic_async_barrier_arrive_b64 v5 ; encoding: [0x00,0x00,0x58,0xd9,0x05,0x00,0x00,0x00]
+0x00,0x00,0x58,0xd9,0x05,0x00,0x00,0x00
+
+# GFX1250: ds_atomic_barrier_arrive_rtn_b64 v[254:255], v2, v[4:5] offset:65535 ; encoding: [0xff,0xff,0xd4,0xd9,0x02,0x04,0x00,0xfe]
+0xff,0xff,0xd4,0xd9,0x02,0x04,0x00,0xfe
+
+# GFX1250: ds_atomic_barrier_arrive_rtn_b64 v[2:3], v2, v[4:5] ; encoding: [0x00,0x00,0xd4,0xd9,0x02,0x04,0x00,0x02]
+0x00,0x00,0xd4,0xd9,0x02,0x04,0x00,0x02
+
+# GFX1250: ds_atomic_barrier_arrive_rtn_b64 v[2:3], v2, v[4:5] offset:513 ; encoding: [0x01,0x02,0xd4,0xd9,0x02,0x04,0x00,0x02]
+0x01,0x02,0xd4,0xd9,0x02,0x04,0x00,0x02
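
For readers who want to try the new builtins, a minimal OpenCL usage sketch modeled on the CodeGenOpenCL test above follows. The kernel wrapper and variable names are illustrative only; the builtin prototypes ("vLi*3" and "LiLi*3Li") come from the BuiltinsAMDGPU.def change, and a gfx1250 target (-target-cpu gfx1250) is assumed.

  // Illustrative sketch only, not part of the commit.
  kernel void barrier_arrive_example(local long *bar, global long *out, long data) {
    // Fire-and-forget form: per the intrinsic definition above, this atomically
    // updates the LDS location and the async counter (ASYNC_CNT); no result.
    __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64(bar);

    // Returning form: performs the barrier-arrive atomic on the LDS location
    // and yields a 64-bit result, per the LiLi*3Li prototype.
    out[0] = __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64(bar, data);
  }

Both calls lower to the corresponding llvm.amdgcn.ds.atomic.* intrinsics, as the clang and CodeGen tests in this commit show.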