llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Mariusz Sikora (mariusz-sikora-at-amd) <details> <summary>Changes</summary> - image_atomic_pk_add_f16 - image_atomic_pk_add_bf16 - ds_pk_add_bf16 - ds_pk_add_f16 - ds_pk_add_rtn_bf16 - ds_pk_add_rtn_f16 - flat_atomic_pk_add_f16 - flat_atomic_pk_add_bf16 - global_atomic_pk_add_f16 - global_atomic_pk_add_bf16 - buffer_atomic_pk_add_f16 - buffer_atomic_pk_add_bf16 --- Patch is 97.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75917.diff 29 Files Affected: - (modified) clang/test/CodeGenOpenCL/amdgpu-features.cl (+2-2) - (added) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl (+92) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+45) - (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+4) - (modified) llvm/lib/Target/AMDGPU/AMDGPUGISel.td (+1) - (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+1) - (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+1) - (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+23-3) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2) - (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+4) - (modified) llvm/lib/Target/AMDGPU/BUFInstructions.td (+21) - (modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+8-4) - (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+4) - (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (+2) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+23-12) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1) - (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+1) - (modified) llvm/lib/TargetParser/TargetParser.cpp (+4) - (added) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll (+433) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll (+60) - (modified) llvm/test/MC/AMDGPU/gfx11_unsupported.s (+18) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_ds.s (+75) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s (+132) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_vflat.s (+60) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_vimage.s (+54) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt (+60) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt (+84) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt (+60) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt (+54) ``````````diff diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 8959634572b44e..fe1798406967e8 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -100,8 +100,8 @@ // GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl new file mode 100644 index 00000000000000..20ff12c3376370 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl @@ -0,0 +1,92 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \ +// RUN: %s -S -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \ +// RUN: -S -o - %s | FileCheck -check-prefix=GFX12 %s + +// REQUIRES: amdgpu-registered-target + +typedef half __attribute__((ext_vector_type(2))) half2; +typedef short __attribute__((ext_vector_type(2))) short2; + +// CHECK-LABEL: test_local_add_2bf16 +// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> % +// GFX12-LABEL: test_local_add_2bf16 +// GFX12: ds_pk_add_rtn_bf16 +short2 test_local_add_2bf16(__local short2 *addr, short2 x) { + return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_local_add_2bf16_noret +// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> % +// GFX12-LABEL: test_local_add_2bf16_noret +// GFX12: ds_pk_add_bf16 +void test_local_add_2bf16_noret(__local short2 *addr, short2 x) { + __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_local_add_2f16 +// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> % +// GFX12-LABEL: test_local_add_2f16 +// GFX12: ds_pk_add_rtn_f16 +half2 test_local_add_2f16(__local half2 *addr, half2 x) { + return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_local_add_2f16_noret +// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> % +// GFX12-LABEL: test_local_add_2f16_noret +// GFX12: ds_pk_add_f16 +void test_local_add_2f16_noret(__local half2 *addr, half2 x) { + __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_flat_add_2f16 +// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %{{.*}}, <2 x half> %{{.*}}) +// GFX12-LABEL: test_flat_add_2f16 +// GFX12: flat_atomic_pk_add_f16 +half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { + return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_flat_add_2bf16 +// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}}) +// GFX12-LABEL: test_flat_add_2bf16 +// GFX12: flat_atomic_pk_add_bf16 +short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) { + return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_global_add_half2 +// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}}) +// GFX12-LABEL: test_global_add_half2 +// GFX12: global_atomic_pk_add_f16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN +void test_global_add_half2(__global half2 *addr, half2 x) { + half2 *rtn; + *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_global_add_half2_noret +// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}}) +// GFX12-LABEL: test_global_add_half2_noret +// GFX12: global_atomic_pk_add_f16 v[0:1], v2, off +void test_global_add_half2_noret(__global half2 *addr, half2 x) { + __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_global_add_2bf16 +// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}}) +// GFX12-LABEL: test_global_add_2bf16 +// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN +void test_global_add_2bf16(__global short2 *addr, short2 x) { + short2 *rtn; + *rtn = __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_global_add_2bf16_noret +// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}}) +// GFX12-LABEL: test_global_add_2bf16_noret +// GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off +void test_global_add_2bf16_noret(__global short2 *addr, short2 x) { + __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 51bd9b63c127ed..bea39743525b23 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1026,6 +1026,9 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = { defm int_amdgcn_image_atomic_cmpswap : AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">, AMDGPUArg<LLVMMatchType<0>, "cmp">]>; + + defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">; + defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimAtomic<"ATOMIC_PK_ADD_BF16">; } ////////////////////////////////////////////////////////////////////////// @@ -1294,6 +1297,26 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< // gfx908 intrinsic def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; +// gfx12+ intrinsic +def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic < + [llvm_v2i16_ty], + [llvm_v2i16_ty, + llvm_v4i32_ty, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; +def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic < + [llvm_v2i16_ty], + [llvm_v2i16_ty, + AMDGPUBufferRsrcTy, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [IntrArgMemOnly, NoCapture<ArgIndex<1>>, + ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < [data_ty], @@ -1368,6 +1391,28 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< // gfx908 intrinsic def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; +// gfx12 intrinsic +def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic < + [llvm_v2i16_ty], + [llvm_v2i16_ty, + llvm_v4i32_ty, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; +def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic < + [llvm_v2i16_ty], + [llvm_v2i16_ty, + AMDGPUBufferRsrcTy, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [IntrArgMemOnly, NoCapture<ArgIndex<1>>, + ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; // gfx90a intrinsics def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 060fb66d38f7bc..14b2155938eef6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1473,6 +1473,10 @@ def FeatureISAVersion12 : FeatureSet< FeatureArchitectedFlatScratch, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, + FeatureAtomicDsPkAdd16Insts, + FeatureAtomicFlatPkAdd16Insts, + FeatureAtomicBufferGlobalPkAddF16Insts, + FeatureAtomicGlobalPkAddBF16Inst, FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 2b85024a9b40be..f426b7f38428da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -261,6 +261,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 156a264a7c1faa..8205bdac4e2c5d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5425,6 +5425,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) + NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16) NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 827fb106b55199..8d972c46447b8b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -587,6 +587,7 @@ enum NodeType : unsigned { BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_CSUB, BUFFER_ATOMIC_FADD, + BUFFER_ATOMIC_FADD_BF16, BUFFER_ATOMIC_FMIN, BUFFER_ATOMIC_FMAX, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fbee2888945185..32964e5c2e3ef1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5872,6 +5872,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; + case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16; case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: case Intrinsic::amdgcn_struct_buffer_atomic_fmin: @@ -6079,6 +6082,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); LLT Ty = MRI->getType(VData); + const bool IsAtomicPacked16Bit = + (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + // Check for 16 bit addresses and pack if true. LLT GradTy = MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); @@ -6087,7 +6094,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( const bool IsG16 = ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; const bool IsA16 = AddrTy == S16; - const bool IsD16 = Ty.getScalarType() == S16; + const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16; int DMaskLanes = 0; if (!BaseOpcode->Atomic) { @@ -6129,7 +6136,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( LLT Ty = MRI->getType(VData0); // TODO: Allow atomic swap and bit ops for v2s16/v4s16 - if (Ty.isVector()) + if (Ty.isVector() && !IsAtomicPacked16Bit) return false; if (BaseOpcode->AtomicX2) { @@ -6265,9 +6272,18 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( if (NumElts > 4 || DMaskLanes > 4) return false; + // Image atomic instructions are using DMask to specify how many bits + // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16). + // DMaskLanes for image atomic has default value '0'. + // We must be sure that atomic variants (especially packed) will not be + // truncated from v2s16 or v4s16 to s16 type. + // + // ChangeElementCount will be needed for image load where Ty is always scalar. const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; const LLT AdjustedTy = - Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); + DMaskLanes == 0 + ? Ty + : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); // The raw dword aligned data component of the load. The only legal cases // where this matters should be when using the packed D16 format, for @@ -7069,6 +7085,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: + case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16: return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::trap: return legalizeTrapIntrinsic(MI, MRI, B); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c9412f720c62ec..d8d19f65190bb4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2920,6 +2920,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { applyDefaultMapping(OpdMapper); @@ -4200,6 +4201,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { // vdata_out diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index beb670669581f1..f4415aaa6b1ff9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -278,6 +278,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; @@ -294,6 +295,7 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>; @@ -310,6 +312,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; @@ -326,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 43d35fa5291ca0..2a6ad3b0d4a25b 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1245,6 +1245,13 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; +let SubtargetPredicate = isGFX12Plus in { +let FPAtomic = 1 in +defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics < + "buffer_atomic_pk_add_bf16", VGPR_32, v2i16 +>; +} + //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// @@ -1708,6 +1715,10 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>; +let SubtargetPredicate = isGFX12Plus in { + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2i16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">; +} + let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32,... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/75917 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits