[libcxxabi] [flang] [clang-tools-extra] [libcxx] [lldb] [clang] [llvm] [compiler-rt] [lld] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/74576 >From 23759746b66c33028ad2340b1e98067ebf1f8074 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 28 Jun 2022 15:24:24 -0700 Subject: [PATCH] [AMDGPU] GFX12: select @llvm.prefetch intrinsic --- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 21 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 22 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 2 + llvm/lib/Target/AMDGPU/SIInstructions.td | 12 + llvm/lib/Target/AMDGPU/SMInstructions.td | 34 ++ llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 496 ++ 8 files changed, 591 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 62996a3b3fb79..f0b3ed7adc294 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 8); // M0 return; +case Intrinsic::prefetch: { + if (!Subtarget.hasPrefetch()) { +MI.eraseFromParent(); +return; + } + unsigned PtrBank = + getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID); + if (PtrBank == AMDGPU::VGPRRegBankID) { +MI.eraseFromParent(); +return; + } + // FIXME: There is currently no support for prefetch in global isel. + // There is no node equivalence and what's worse there is no MMO produced + // for a prefetch on global isel path. + // Prefetch does not affect execution so erase it for now. 
+ MI.eraseFromParent(); + return; +} default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1 break; } +case Intrinsic::prefetch: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; default: return getInvalidInstructionMapping(); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 94b9e49b765a6..21a9b8147034f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasInstPrefetch() const { return getGeneration() >= GFX10; } + bool hasPrefetch() const { return GFX12Insts; } + // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspective of an arbitrary workitem, this // is 4-byte aligned. 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a7f4d63229b7e..93af38d877c5d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); + if (Subtarget->hasPrefetch()) +setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::v2i16, MVT::v2f16, MVT::i128}, @@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); } +SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { + if (Op->isDivergent()) +return SDValue(); + + switch (cast(Op)->getAddressSpace()) { + case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: +break; + default: +return SDValue(); + } + + return Op; +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch(RegName) @@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSTACKSAVE(Op, DAG); case ISD::GET_ROUNDING: return lowerGET_ROUNDING(Op, DAG); + case ISD::PREFETCH: +return lowerPREFETCH(Op, DAG); } return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index c9cc149218a99..5bc091d6e84de 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
[libcxxabi] [flang] [lld] [llvm] [compiler-rt] [lldb] [clang] [clang-tools-extra] [libcxx] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)
@@ -959,6 +967,32 @@ def : GCNPat < } } // let OtherPredicates = [HasShaderCyclesRegister] +def SIMM24bitPtr : ImmLeaf (Imm);}] +>; + +multiclass SMPrefetchPat { + def : GCNPat < +(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)), +(!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0)) + >; + + def : GCNPat < +(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)), +(!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0)) + >; + + def : GCNPat < +(prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)), +(!cast("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0)) + > { +let AddedComplexity = 10; + } mariusz-sikora-at-amd wrote: Maybe for now I will remove the PC_REL part. https://github.com/llvm/llvm-project/pull/74576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libcxx] [flang] [clang-tools-extra] [libcxxabi] [compiler-rt] [clang] [lldb] [lld] [llvm] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/74576 >From 23759746b66c33028ad2340b1e98067ebf1f8074 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 28 Jun 2022 15:24:24 -0700 Subject: [PATCH 1/2] [AMDGPU] GFX12: select @llvm.prefetch intrinsic --- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 21 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 22 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 2 + llvm/lib/Target/AMDGPU/SIInstructions.td | 12 + llvm/lib/Target/AMDGPU/SMInstructions.td | 34 ++ llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 496 ++ 8 files changed, 591 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 62996a3b3fb79..f0b3ed7adc294 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 8); // M0 return; +case Intrinsic::prefetch: { + if (!Subtarget.hasPrefetch()) { +MI.eraseFromParent(); +return; + } + unsigned PtrBank = + getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID); + if (PtrBank == AMDGPU::VGPRRegBankID) { +MI.eraseFromParent(); +return; + } + // FIXME: There is currently no support for prefetch in global isel. + // There is no node equivalence and what's worse there is no MMO produced + // for a prefetch on global isel path. + // Prefetch does not affect execution so erase it for now. 
+ MI.eraseFromParent(); + return; +} default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1 break; } +case Intrinsic::prefetch: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; default: return getInvalidInstructionMapping(); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 94b9e49b765a6..21a9b8147034f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasInstPrefetch() const { return getGeneration() >= GFX10; } + bool hasPrefetch() const { return GFX12Insts; } + // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspective of an arbitrary workitem, this // is 4-byte aligned. 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a7f4d63229b7e..93af38d877c5d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); + if (Subtarget->hasPrefetch()) +setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::v2i16, MVT::v2f16, MVT::i128}, @@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); } +SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { + if (Op->isDivergent()) +return SDValue(); + + switch (cast(Op)->getAddressSpace()) { + case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: +break; + default: +return SDValue(); + } + + return Op; +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch(RegName) @@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSTACKSAVE(Op, DAG); case ISD::GET_ROUNDING: return lowerGET_ROUNDING(Op, DAG); + case ISD::PREFETCH: +return lowerPREFETCH(Op, DAG); } return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index c9cc149218a99..5bc091d6e84de 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowerin
[clang] [llvm] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)
mariusz-sikora-at-amd wrote: ping https://github.com/llvm/llvm-project/pull/74836 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[compiler-rt] [clang-tools-extra] [libcxx] [llvm] [flang] [clang] [libc] [AMDGPU] Update VOP instructions for GFX12 (PR #74853)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/74853 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [mlir] [flang] [lld] [libc] [llvm] [compiler-rt] [libcxx] [lldb] [clang] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/74836 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][GFX12] Add new v_permlane16 variants (PR #75475)
https://github.com/mariusz-sikora-at-amd created https://github.com/llvm/llvm-project/pull/75475 None >From c878aa8f2e331cf8c88ab6e191db663ed56d9ce7 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Thu, 14 Dec 2023 11:13:36 +0100 Subject: [PATCH] [AMDGPU][GFX12] Add new v_permlane16 variants --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 + .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl| 48 + .../builtins-amdgcn-error-gfx12-param.cl | 14 + .../SemaOpenCL/builtins-amdgcn-error-gfx12.cl | 16 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 18 + .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 19 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 9 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 2 + .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 4 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td| 30 + .../UniformityAnalysis/AMDGPU/intrinsics.ll | 16 + .../AMDGPU/llvm.amdgcn.permlane16.var.ll | 896 ++ .../CodeGen/AMDGPU/permlane16_var-op-sel.ll | 15 + .../AMDGPU/vcmpx-permlane16var-hazard.mir | 168 llvm/test/MC/AMDGPU/gfx11_unsupported.s | 6 + llvm/test/MC/AMDGPU/gfx12_asm_vop3.s | 51 + llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s | 95 ++ .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt | 51 + 19 files changed, 1459 insertions(+), 5 deletions(-) create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12-param.cl create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12.cl create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll create mode 100644 llvm/test/CodeGen/AMDGPU/permlane16_var-op-sel.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vcmpx-permlane16var-hazard.mir diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 7465f13d552d6e..e562ef04a30194 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -410,6 +410,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", 
"fp8-insts") // GFX12+ only builtins. //===--===// +TARGET_BUILTIN(__builtin_amdgcn_permlane16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_permlanex16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index b8d281531e218e..2899d9e5c28898 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -1,6 +1,54 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int uint; + +// CHECK-LABEL: @test_permlane16_var( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:[[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:[[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT:store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT:store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4 +// CHECK-NEXT:store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT:[[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4 +// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4 +// CHECK-NEXT:[[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane16.var(i32 [[TMP0]], i32 [[TMP1]], i32 
[[TMP2]], i1 false, i1 false) +// CHECK-NEXT:[[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT:store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT:ret void +// +void test_permlane16_var(global uint* out, uint a, uint b, uint c) { + *out = __builtin_amdgcn_permlane16_var(a, b, c, 0, 0); +} + +// CHECK-LABEL: @test_permlanex16_var( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:[[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:[[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5
[llvm] [libcxx] [clang] [clang-tools-extra] [lldb] [libc] [flang] [mlir] [compiler-rt] [lld] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)
@@ -684,6 +684,51 @@ s_rndne_f16 s5, 0xfe0b s_rndne_f16 s5, 0x3456 // GFX12: encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00] +s_barrier_signal -2 mariusz-sikora-at-amd wrote: Thanks! https://github.com/llvm/llvm-project/pull/74836 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][GFX12] Add new v_permlane16 variants (PR #75475)
mariusz-sikora-at-amd wrote: > LGTM > > You could also update existing permlane tests with run lines for gfx12: > > * test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll > > * test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir > > > This can also be a separate patch. New patch: https://github.com/llvm/llvm-project/pull/75572 https://github.com/llvm/llvm-project/pull/75475 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][GFX12] Add new v_permlane16 variants (PR #75475)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/75475 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang-tools-extra] [libc] [compiler-rt] [libcxx] [lldb] [mlir] [flang] [lld] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)
@@ -684,6 +684,51 @@ s_rndne_f16 s5, 0xfe0b s_rndne_f16 s5, 0x3456 // GFX12: encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00] +s_barrier_signal -2 mariusz-sikora-at-amd wrote: Patch: https://github.com/llvm/llvm-project/pull/75575 https://github.com/llvm/llvm-project/pull/74836 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [clang] [lldb] [libcxxabi] [lld] [compiler-rt] [clang-tools-extra] [llvm] [libcxx] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/74576 >From 23759746b66c33028ad2340b1e98067ebf1f8074 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 28 Jun 2022 15:24:24 -0700 Subject: [PATCH 1/4] [AMDGPU] GFX12: select @llvm.prefetch intrinsic --- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 21 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 22 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 2 + llvm/lib/Target/AMDGPU/SIInstructions.td | 12 + llvm/lib/Target/AMDGPU/SMInstructions.td | 34 ++ llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 496 ++ 8 files changed, 591 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 62996a3b3fb79f..f0b3ed7adc294c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 8); // M0 return; +case Intrinsic::prefetch: { + if (!Subtarget.hasPrefetch()) { +MI.eraseFromParent(); +return; + } + unsigned PtrBank = + getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID); + if (PtrBank == AMDGPU::VGPRRegBankID) { +MI.eraseFromParent(); +return; + } + // FIXME: There is currently no support for prefetch in global isel. + // There is no node equivalence and what's worse there is no MMO produced + // for a prefetch on global isel path. + // Prefetch does not affect execution so erase it for now. 
+ MI.eraseFromParent(); + return; +} default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1 break; } +case Intrinsic::prefetch: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; default: return getInvalidInstructionMapping(); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 94b9e49b765a6f..21a9b8147034fc 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasInstPrefetch() const { return getGeneration() >= GFX10; } + bool hasPrefetch() const { return GFX12Insts; } + // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspective of an arbitrary workitem, this // is 4-byte aligned. 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a7f4d63229b7ef..93af38d877c5d4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); + if (Subtarget->hasPrefetch()) +setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::v2i16, MVT::v2f16, MVT::i128}, @@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); } +SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { + if (Op->isDivergent()) +return SDValue(); + + switch (cast(Op)->getAddressSpace()) { + case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: +break; + default: +return SDValue(); + } + + return Op; +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch(RegName) @@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSTACKSAVE(Op, DAG); case ISD::GET_ROUNDING: return lowerGET_ROUNDING(Op, DAG); + case ISD::PREFETCH: +return lowerPREFETCH(Op, DAG); } return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index c9cc149218a997..5bc091d6e84de3 100644 --- a/llvm/lib/Target/AMDGPU/SIISe
[llvm] [lld] [clang] [compiler-rt] [libcxxabi] [lldb] [clang-tools-extra] [flang] [libcxx] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/74576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [lldb] [libc] [clang] [flang] [clang-tools-extra] [libcxx] [mlir] [compiler-rt] [lld] GFX12: Add LoopDataPrefetchPass (PR #75625)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/75625 >From de5303eb8a9e061dbd365922f85cad02bca5ec26 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 5 Jul 2022 11:41:29 -0700 Subject: [PATCH 1/3] GFX12: Add LoopDataPrefetchPass It is currently disabled by default. It will need experiments on a real HW to tune and decide on the profitability. --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 + .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 18 ++ .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 10 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 4 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 8 +- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 185 ++ 6 files changed, 231 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e8c04ecf39ba02..fdc2077868cf99 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -345,6 +345,11 @@ static cl::opt EnableImageIntrinsicOptimizer( cl::desc("Enable image intrinsic optimizer pass"), cl::init(true), cl::Hidden); +static cl::opt +EnableLoopPrefetch("amdgpu-loop-prefetch", + cl::desc("Enable loop data prefetch on AMDGPU"), + cl::Hidden, cl::init(false)); + static cl::opt EnableMaxIlpSchedStrategy( "amdgpu-enable-max-ilp-scheduling-strategy", cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), @@ -982,6 +987,8 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { } void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { + if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive)) +addPass(createLoopDataPrefetchPass()); addPass(createSeparateConstOffsetFromGEPPass()); // ReassociateGEPs exposes more opportunities for SLSR. See // the example in reassociate-geps-and-slsr.ll. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index f1da1a61bf4dd5..218c5b5cfdac87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1345,3 +1345,21 @@ GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const { Cost.first += (Size + 255) / 256; return Cost; } + +unsigned GCNTTIImpl::getPrefetchDistance() const { + return ST->hasPrefetch() ? 128 : 0; +} + +bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { + switch (AS) { + case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: +return true; + default: +break; + } + + return false; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 1e6c5bbfc0d75b..cd8e9fd10bbf21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -254,6 +254,16 @@ class GCNTTIImpl final : public BasicTTIImplBase { InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind); + + /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12. + unsigned getCacheLineSize() const override { return 128; } + + /// How much before a load we should place the prefetch instruction. + /// This is currently measured in number of IR instructions. + unsigned getPrefetchDistance() const override; + + /// \return if target want to issue a prefetch in address space \p AS. 
+ bool shouldPrefetchAddressSpace(unsigned AS) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 70ef1fff274a40..717f22fb69fdd3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -245,6 +245,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) return false; + // A mayLoad instruction without a def is not a load. Likely a prefetch. + if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs()) +return false; + if (isDS(Opc0) && isDS(Opc1)) { // FIXME: Handle this case: diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 488dbe2e3189bf..8b0b6263832243 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -4,7 +4,7 @@ ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \ ; RUN: | FileCheck -match-full-lines -strict-whitespace -check-prefix=
[llvm] [lldb] [libc] [clang] [flang] [clang-tools-extra] [libcxx] [mlir] [compiler-rt] [lld] GFX12: Add LoopDataPrefetchPass (PR #75625)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/75625 >From de5303eb8a9e061dbd365922f85cad02bca5ec26 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 5 Jul 2022 11:41:29 -0700 Subject: [PATCH 1/4] GFX12: Add LoopDataPrefetchPass It is currently disabled by default. It will need experiments on a real HW to tune and decide on the profitability. --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 + .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 18 ++ .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 10 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 4 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 8 +- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 185 ++ 6 files changed, 231 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e8c04ecf39ba02..fdc2077868cf99 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -345,6 +345,11 @@ static cl::opt EnableImageIntrinsicOptimizer( cl::desc("Enable image intrinsic optimizer pass"), cl::init(true), cl::Hidden); +static cl::opt +EnableLoopPrefetch("amdgpu-loop-prefetch", + cl::desc("Enable loop data prefetch on AMDGPU"), + cl::Hidden, cl::init(false)); + static cl::opt EnableMaxIlpSchedStrategy( "amdgpu-enable-max-ilp-scheduling-strategy", cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), @@ -982,6 +987,8 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { } void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { + if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive)) +addPass(createLoopDataPrefetchPass()); addPass(createSeparateConstOffsetFromGEPPass()); // ReassociateGEPs exposes more opportunities for SLSR. See // the example in reassociate-geps-and-slsr.ll. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index f1da1a61bf4dd5..218c5b5cfdac87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1345,3 +1345,21 @@ GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const { Cost.first += (Size + 255) / 256; return Cost; } + +unsigned GCNTTIImpl::getPrefetchDistance() const { + return ST->hasPrefetch() ? 128 : 0; +} + +bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { + switch (AS) { + case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: +return true; + default: +break; + } + + return false; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 1e6c5bbfc0d75b..cd8e9fd10bbf21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -254,6 +254,16 @@ class GCNTTIImpl final : public BasicTTIImplBase { InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind); + + /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12. + unsigned getCacheLineSize() const override { return 128; } + + /// How much before a load we should place the prefetch instruction. + /// This is currently measured in number of IR instructions. + unsigned getPrefetchDistance() const override; + + /// \return if target want to issue a prefetch in address space \p AS. 
+ bool shouldPrefetchAddressSpace(unsigned AS) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 70ef1fff274a40..717f22fb69fdd3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -245,6 +245,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) return false; + // A mayLoad instruction without a def is not a load. Likely a prefetch. + if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs()) +return false; + if (isDS(Opc0) && isDS(Opc1)) { // FIXME: Handle this case: diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 488dbe2e3189bf..8b0b6263832243 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -4,7 +4,7 @@ ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \ ; RUN: | FileCheck -match-full-lines -strict-whitespace -check-prefix=
[lldb] [clang] [compiler-rt] [libcxx] [flang] [lld] [clang-tools-extra] [mlir] [llvm] [libc] GFX12: Add LoopDataPrefetchPass (PR #75625)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/75625 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
https://github.com/mariusz-sikora-at-amd created https://github.com/llvm/llvm-project/pull/75917 - image_atomic_pk_add_f16 - image_atomic_pk_add_bf16 - ds_pk_add_bf16 - ds_pk_add_f16 - ds_pk_add_rtn_bf16 - ds_pk_add_rtn_f16 - flat_atomic_pk_add_f16 - flat_atomic_pk_add_bf16 - global_atomic_pk_add_f16 - global_atomic_pk_add_bf16 - buffer_atomic_pk_add_f16 - buffer_atomic_pk_add_bf16 >From f0920d06a57b3bc77b50baf94c4616be597e74c3 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Mon, 18 Dec 2023 20:08:18 +0100 Subject: [PATCH] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions - image_atomic_pk_add_f16 - image_atomic_pk_add_bf16 - ds_pk_add_bf16 - ds_pk_add_f16 - ds_pk_add_rtn_bf16 - ds_pk_add_rtn_f16 - flat_atomic_pk_add_f16 - flat_atomic_pk_add_bf16 - global_atomic_pk_add_f16 - global_atomic_pk_add_bf16 - buffer_atomic_pk_add_f16 - buffer_atomic_pk_add_bf16 --- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 +- .../builtins-fp-atomics-gfx12.cl | 92 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 45 ++ llvm/lib/Target/AMDGPU/AMDGPU.td | 4 + llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 26 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 4 + llvm/lib/Target/AMDGPU/BUFInstructions.td | 21 + llvm/lib/Target/AMDGPU/DSInstructions.td | 12 +- llvm/lib/Target/AMDGPU/FLATInstructions.td| 4 + llvm/lib/Target/AMDGPU/MIMGInstructions.td| 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + llvm/lib/TargetParser/TargetParser.cpp| 4 + .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 433 ++ .../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 60 +++ llvm/test/MC/AMDGPU/gfx11_unsupported.s | 18 + llvm/test/MC/AMDGPU/gfx12_asm_ds.s| 75 +++ llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 132 ++ 
llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 60 +++ llvm/test/MC/AMDGPU/gfx12_asm_vimage.s| 54 +++ .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 60 +++ .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt | 84 .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 60 +++ .../Disassembler/AMDGPU/gfx12_dasm_vimage.txt | 54 +++ 29 files changed, 1329 insertions(+), 21 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl create mode 100644 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 8959634572b44e..fe1798406967e8 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -100,8 +100,8 @@ // GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx1
[clang-tools-extra] [llvm] [AMDGPU] Quit PromoteAllocaToVector if intrinsic is used (PR #68744)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/68744 >From 3dc3a43193247015933392b7da76c4ef77268231 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Tue, 10 Oct 2023 21:50:48 +0200 Subject: [PATCH 1/2] [AMDGPU] Bail if assume-like intrinsic is used in PromoteAllocaToVector Attached test will cause crash without this change. --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 ++ .../test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll | 9 + 2 files changed, 11 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 3707a960211eb4..8ec7d29e00c939 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -772,6 +772,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { // Ignore assume-like intrinsics and comparisons used in assumes. if (isAssumeLikeIntrinsic(Inst)) { + if (!Inst->use_empty()) +return RejectUser(Inst, "assume-like intrinsic cannot have any users"); UsersToRemove.push_back(Inst); continue; } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll index 0bba1bdce95655..5616bc0f5ef3c1 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll @@ -53,6 +53,15 @@ define amdgpu_kernel void @promote_with_objectsize(ptr addrspace(1) %out) #0 { ret void } +; CHECK-LABEL: @promote_with_objectsize_8( +; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [8 x i32]], ptr addrspace(3) @promote_with_objectsize_8.alloca, i32 0, i32 %{{[0-9]+}} +; CHECK: call i32 @llvm.objectsize.i32.p3(ptr addrspace(3) [[PTR]], i1 false, i1 false, i1 false) +define amdgpu_kernel void @promote_with_objectsize_8(ptr addrspace(1) %out) #0 { + %alloca = alloca [8 x i32], align 4, addrspace(5) + %size = call i32 
@llvm.objectsize.i32.p5(ptr addrspace(5) %alloca, i1 false, i1 false, i1 false) + store i32 %size, ptr addrspace(1) %out + ret void +} ; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy( ; CHECK: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false) define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) { >From a85ab29fe8f338ff740456ea6b8890dd1212ca6c Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Tue, 21 Nov 2023 09:18:45 +0100 Subject: [PATCH 2/2] Fixup - handle objectsize in PromoteAlloca --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 16 .../AMDGPU/promote-alloca-mem-intrinsics.ll | 3 +-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 8ec7d29e00c939..90d0dc96898758 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -528,6 +528,15 @@ static Value *promoteAllocaUserToVector( return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); } +if (auto *Intr = dyn_cast(Inst)) { + if (Intr->getIntrinsicID() == Intrinsic::objectsize) { +Intr->replaceAllUsesWith( +Builder.getIntN(Intr->getType()->getIntegerBitWidth(), +DL.getTypeAllocSize(VectorTy))); +return nullptr; + } +} + llvm_unreachable("Unsupported call when promoting alloca to vector"); } @@ -770,6 +779,13 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { continue; } +if (auto *Intr = dyn_cast(Inst)) { + if (Intr->getIntrinsicID() == Intrinsic::objectsize) { +WorkList.push_back(Inst); +continue; + } +} + // Ignore assume-like intrinsics and comparisons used in assumes. 
if (isAssumeLikeIntrinsic(Inst)) { if (!Inst->use_empty()) diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll index 5616bc0f5ef3c1..aabd5df9568370 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll @@ -54,8 +54,7 @@ define amdgpu_kernel void @promote_with_objectsize(ptr addrspace(1) %out) #0 { } ; CHECK-LABEL: @promote_with_objectsize_8( -; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [8 x i32]], ptr addrspace(3) @promote_with_objectsize_8.alloca, i32 0, i32 %{{[0-9]+}} -; CHECK: call i32 @llvm.objectsize.i32.p3(ptr addrspace(3) [[PTR]], i1 false, i1 false, i1 false) +; CHECK: store i32 32, ptr addrspace(1) %out, align 4 define amdgpu_kernel void @promote_with_objectsize_8(ptr addrspace(1) %out) #0 { %alloca = alloca [8 x i32], a
[llvm] [clang] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
@@ -0,0 +1,92 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \ +// RUN: %s -S -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \ mariusz-sikora-at-amd wrote: What do you suggest? I copied this test from the other builtins-fp-atomics-gfxXX tests. I thought it was a good test because it covers both LLVM intrinsic and ISA generation. https://github.com/llvm/llvm-project/pull/75917 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [AMDGPU] Quit PromoteAllocaToVector if intrinsic is used (PR #68744)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/68744 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [AMDGPU][GFX12] Default component broadcast store (PR #76212)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/76212 >From 06117c6124e94953f62eff3b1b87d98146f9e25e Mon Sep 17 00:00:00 2001 From: Mateja Marjanovic Date: Wed, 10 May 2023 16:24:38 +0200 Subject: [PATCH 1/2] [AMDGPU][GFX12] Default component broadcast store For image and buffer stores the default behaviour on GFX12 is to set all unset components to the value of the first component. So if we pass only X component, it will be the same as XXXX, or XY same as XYXX. This patch simplifies the passed vector of components in InstCombine by removing components from the end that are equal to the first component. For image stores it also trims DMask if necessary. --- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 36 +-- .../amdgcn-simplify-image-buffer-stores.ll| 32 - 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 2bb7b6bd0674a2..da2f862308558b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -402,6 +402,35 @@ static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, return DemandedElts; } +// Trim elements of the end of the vector \p V, if they are +// equal to the first element of the vector. 
+static APInt defaultComponentBroadcast(Value *V) { + auto *VTy = cast(V->getType()); + unsigned VWidth = VTy->getNumElements(); + APInt DemandedElts = APInt::getAllOnes(VWidth); + Value *FirstComponent = findScalarElement(V, 0); + + SmallVector ShuffleMask; + if (auto *SVI = dyn_cast(V)) +SVI->getShuffleMask(ShuffleMask); + + for (int I = VWidth - 1; I > 0; --I) { +if (ShuffleMask.empty()) { + auto *Elt = findScalarElement(V, I); + if (!Elt || (Elt != FirstComponent && !isa(Elt))) +break; +} else { + // Detect identical elements in the shufflevector result, even though + // findScalarElement cannot tell us what that element is. + if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem) +break; +} +DemandedElts.clearBit(I); + } + + return DemandedElts; +} + static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, @@ -1140,8 +1169,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (!isa(II.getArgOperand(0)->getType())) break; -APInt DemandedElts = -trimTrailingZerosInVector(IC, II.getArgOperand(0), &II); +APInt DemandedElts; +if (AMDGPU::isGFX12Plus(*ST)) + DemandedElts = defaultComponentBroadcast(II.getArgOperand(0)); +else + DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II); int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 
1 : -1; if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx, diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll index f2d904cce7f00d..95b1d09bbd6036 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll @@ -23,7 +23,8 @@ define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg ; GCN-NEXT:ret void ; ; GFX12-LABEL: @image_store_1d_store_insert_zeros_at_end( -; GFX12-NEXT:call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFX12-NEXT:[[NEWVDATA4:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFX12-NEXT:call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) ; GFX12-NEXT:ret void ; %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0 @@ -63,9 +64,9 @@ define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg % ; GCN-NEXT:ret void ; ; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end( -; GFX12-NEXT:[[TMP1:%.*]] = insertelement <2 x float> poison, float [[VDATA1:%.*]], i64 0 -; GFX12-NEXT:[[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer -; GFX12-NEXT:call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false) +; GFX12-NEXT:[[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFX12-NEXT:[[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1 +; GFX12-NEXT:call void @llvm.amdgcn.buffer.st
[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/76224 >From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 21 Dec 2023 11:58:47 +0100 Subject: [PATCH 1/2] [AMDGPU][GFX12] Add Atomic cond_sub_u32 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 7 + llvm/lib/Target/AMDGPU/BUFInstructions.td | 14 + llvm/lib/Target/AMDGPU/DSInstructions.td | 27 +- llvm/lib/Target/AMDGPU/FLATInstructions.td| 31 +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 254 ++ .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 llvm/test/MC/AMDGPU/gfx11_unsupported.s | 12 + llvm/test/MC/AMDGPU/gfx12_asm_ds.s| 18 ++ llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 66 + llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 36 +++ .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 81 ++ .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt | 42 +++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 18 ++ 23 files changed, 812 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index cb48f54b13a6cd..2d066350ee9f84 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,6 +10,10 @@ // //===--===// +def flat_ptr_ty : LLVMQualPointerType<0>; +def global_ptr_ty : LLVMQualPointerType<1>; +def local_ptr_ty : 
LLVMQualPointerType<3>; + class AMDGPUReadPreloadRegisterIntrinsic : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; @@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic; +def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; +def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic; def 
int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -class AMDGPUAtomicRtn : Intrinsic < +class AMDGPUAtomicRtn : Intrinsic < [vt], - [llvm_anyptr_ty,// vaddr - vt], // vdata(VGPR) + [pt, // vaddr + vt], // vdata(VGPR) [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; @@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
@@ -2502,10 +2500,9 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn; -def int_amdgcn_flat_atomic_cond_sub_u32 : AMDGPUAtomicRtn; -def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn; - -def int_amdgcn_ds_cond_sub_u32 : AMDGPUAtomicRtn; +def int_amdgcn_flat_atomic_cond_sub_u32 : AMDGPUAtomicRtn; +def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn; +def int_amdgcn_ds_cond_sub_u32: AMDGPUAtomicRtn; mariusz-sikora-at-amd wrote: @arsenm is this what you were expecting ? https://github.com/llvm/llvm-project/pull/76224 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
mariusz-sikora-at-amd wrote: Adding support in atomicrmw: this would require adding a new operation, "cond_sub", to atomicrmw. Or did you have something else in mind, @arsenm? https://github.com/llvm/llvm-project/pull/76224 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Default component broadcast store (PR #76212)
mariusz-sikora-at-amd wrote: ping @arsenm https://github.com/llvm/llvm-project/pull/76212 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [clang] [llvm] [AMDGPU][GFX12] Default component broadcast store (PR #76212)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/76212 >From 06117c6124e94953f62eff3b1b87d98146f9e25e Mon Sep 17 00:00:00 2001 From: Mateja Marjanovic Date: Wed, 10 May 2023 16:24:38 +0200 Subject: [PATCH 1/2] [AMDGPU][GFX12] Default component broadcast store For image and buffer stores the default behaviour on GFX12 is to set all unset components to the value of the first component. So if we pass only X component, it will be the same as XXXX, or XY same as XYXX. This patch simplifies the passed vector of components in InstCombine by removing components from the end that are equal to the first component. For image stores it also trims DMask if necessary. --- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 36 +-- .../amdgcn-simplify-image-buffer-stores.ll| 32 - 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 2bb7b6bd0674a2..da2f862308558b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -402,6 +402,35 @@ static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, return DemandedElts; } +// Trim elements of the end of the vector \p V, if they are +// equal to the first element of the vector. 
+static APInt defaultComponentBroadcast(Value *V) { + auto *VTy = cast(V->getType()); + unsigned VWidth = VTy->getNumElements(); + APInt DemandedElts = APInt::getAllOnes(VWidth); + Value *FirstComponent = findScalarElement(V, 0); + + SmallVector ShuffleMask; + if (auto *SVI = dyn_cast(V)) +SVI->getShuffleMask(ShuffleMask); + + for (int I = VWidth - 1; I > 0; --I) { +if (ShuffleMask.empty()) { + auto *Elt = findScalarElement(V, I); + if (!Elt || (Elt != FirstComponent && !isa(Elt))) +break; +} else { + // Detect identical elements in the shufflevector result, even though + // findScalarElement cannot tell us what that element is. + if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem) +break; +} +DemandedElts.clearBit(I); + } + + return DemandedElts; +} + static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, @@ -1140,8 +1169,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (!isa(II.getArgOperand(0)->getType())) break; -APInt DemandedElts = -trimTrailingZerosInVector(IC, II.getArgOperand(0), &II); +APInt DemandedElts; +if (AMDGPU::isGFX12Plus(*ST)) + DemandedElts = defaultComponentBroadcast(II.getArgOperand(0)); +else + DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II); int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 
1 : -1; if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx, diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll index f2d904cce7f00d..95b1d09bbd6036 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll @@ -23,7 +23,8 @@ define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg ; GCN-NEXT:ret void ; ; GFX12-LABEL: @image_store_1d_store_insert_zeros_at_end( -; GFX12-NEXT:call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFX12-NEXT:[[NEWVDATA4:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFX12-NEXT:call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) ; GFX12-NEXT:ret void ; %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0 @@ -63,9 +64,9 @@ define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg % ; GCN-NEXT:ret void ; ; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end( -; GFX12-NEXT:[[TMP1:%.*]] = insertelement <2 x float> poison, float [[VDATA1:%.*]], i64 0 -; GFX12-NEXT:[[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer -; GFX12-NEXT:call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false) +; GFX12-NEXT:[[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFX12-NEXT:[[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1 +; GFX12-NEXT:call void @llvm.amdgcn.buffer.st
[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Default component broadcast store (PR #76212)
mariusz-sikora-at-amd wrote: Merged with upstream to run the tests. I will merge these changes if CI passes. https://github.com/llvm/llvm-project/pull/76212 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/78729 >From cc492d4134e4aa0aab56d01b21ec85937e49acfd Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 19 Jan 2024 16:29:46 +0100 Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins __builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac* --- .../builtins-amdgcn-gfx12-err.cl | 86 ++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl index bcaea9a2482d1..f91fea1714510 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl @@ -4,10 +4,94 @@ typedef unsigned int uint; -kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) { +#pragma OPENCL EXTENSION cl_khr_fp64:enable + +typedef float v2f __attribute__((ext_vector_type(2))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef float v16f __attribute__((ext_vector_type(16))); +typedef float v32f __attribute__((ext_vector_type(32))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h __attribute__((ext_vector_type(32))); +typedef intv2i __attribute__((ext_vector_type(2))); +typedef intv4i __attribute__((ext_vector_type(4))); +typedef intv16i __attribute__((ext_vector_type(16))); +typedef intv32i __attribute__((ext_vector_type(32))); +typedef short v2s __attribute__((ext_vector_type(2))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef short v16s __attribute__((ext_vector_type(16))); +typedef short v32s __attribute__((ext_vector_type(32))); +typedef double v4d __attribute__((ext_vector_type(4))); + +void builtin_test_unsupported(double a_double, float a_float, + int a_int, long a_long, + v4d a_v4d, + v2s 
a_v2s, v4s a_v4s, v8s a_v8s, + v2i a_v2i, v4i a_v4i, v16i a_v16i, v32i a_v32i, + v2f a_v2f, v4f a_v4f, v16f a_v16f, v32f a_v32f, + v4h a_v4h, v8h a_v8h, + + uint a, uint b) { + __builtin_amdgcn_ds_gws_init(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_init' needs target feature gws}} __builtin_amdgcn_ds_gws_barrier(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_barrier' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_v(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_v' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_br(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_sema_br' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_p(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_p' needs target feature gws}} + + a_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, a_float, a_v32f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, a_float, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs target feature mai-insts}} + a_v4f = __builtin_amdgcn_mfma_f32_4x4x1f32(a_float, a_float, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x1f32' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_32x32x2f32(a_float, a_float, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x2f32' needs target feature mai-insts}} + a_v4f = __builtin_amdgcn_mfma_f32_16x16x4f32(a_float, a_float, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f32' needs target feature mai-insts}} + a_v32f = __builtin_amdgcn_mfma_f32_32x32x4f16(a_v4h, a_v4h, a_v32f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x4f16' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_16x16x4f16(a_v4h, a_v4h, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f16' needs target feature mai-insts}} + a_v4f = 
__builtin_amdgcn_mfma_f32_4x4x4f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x4f16' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_32x32x8f16(a_v4h, a_v4h, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x8f16' needs target feature mai-insts}} + a_v4f = __builtin_amdgcn_mfma_f32_16x16x16f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x16f16' needs target feature mai-insts}} + a_v32i = __builtin_amdgcn_mfma_i32_32x32x4i8(a_int, a_int, a_v32i, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_i32_32x32x4i8' needs targ
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/78729 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[lldb] [flang] [clang-tools-extra] [libcxx] [clang] [lld] [llvm] [compiler-rt] [libc] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)
Mirko =?utf-8?q?Brkušanin?= , Mirko =?utf-8?q?Brkušanin?= ,Mirko Brkusanin ,Mariusz Sikora Message-ID: In-Reply-To: @@ -8770,6 +8781,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, } } +int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); +if (VdstInIdx == static_cast(Inst.getNumOperands())) { + Inst.addOperand(Inst.getOperand(0)); +} + +bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12; mariusz-sikora-at-amd wrote: Thanks, I will prepare different PRs to cover this and what Joe pointed out. https://github.com/llvm/llvm-project/pull/78414 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
mariusz-sikora-at-amd wrote: ping https://github.com/llvm/llvm-project/pull/78729 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
@@ -4,10 +4,114 @@ typedef unsigned int uint; -kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) { +#pragma OPENCL EXTENSION cl_khr_fp64:enable + +typedef float v2f __attribute__((ext_vector_type(2))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef float v16f __attribute__((ext_vector_type(16))); +typedef float v32f __attribute__((ext_vector_type(32))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h __attribute__((ext_vector_type(32))); +typedef intv2i __attribute__((ext_vector_type(2))); +typedef intv4i __attribute__((ext_vector_type(4))); +typedef intv16i __attribute__((ext_vector_type(16))); +typedef intv32i __attribute__((ext_vector_type(32))); +typedef short v2s __attribute__((ext_vector_type(2))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef short v16s __attribute__((ext_vector_type(16))); +typedef short v32s __attribute__((ext_vector_type(32))); +typedef double v4d __attribute__((ext_vector_type(4))); + +void builtin_test_unsupported(global v32f*out_v32f, + global v16f*out_v16f, + global v4f* out_v4f, + global v32i*out_v32i, + global v16i*out_v16i, + global v4i* out_v4i, + global v4d* out_v4d, + global double* out_double, + double a_double , double b_double , double c_double, mariusz-sikora-at-amd wrote: Thanks, I will update these. https://github.com/llvm/llvm-project/pull/78729 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [libc] [compiler-rt] [libcxx] [lldb] [pstl] [flang] [clang-tools-extra] [mlir] [openmp] [lld] [llvm] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)
Mirko Brkušanin, Mirko Brkušanin, Mirko Brkusanin, Mariusz Sikora Message-ID: In-Reply-To: https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/78414 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/78729 >From 56cf06f1b530d5ec62de1cc3818bf2f76dfd Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 19 Jan 2024 16:29:46 +0100 Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins __builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac* --- .../builtins-amdgcn-gfx12-err.cl | 86 ++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl index bcaea9a2482d186..f91fea17145102a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl @@ -4,10 +4,94 @@ typedef unsigned int uint; -kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) { +#pragma OPENCL EXTENSION cl_khr_fp64:enable + +typedef float v2f __attribute__((ext_vector_type(2))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef float v16f __attribute__((ext_vector_type(16))); +typedef float v32f __attribute__((ext_vector_type(32))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h __attribute__((ext_vector_type(32))); +typedef intv2i __attribute__((ext_vector_type(2))); +typedef intv4i __attribute__((ext_vector_type(4))); +typedef intv16i __attribute__((ext_vector_type(16))); +typedef intv32i __attribute__((ext_vector_type(32))); +typedef short v2s __attribute__((ext_vector_type(2))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef short v16s __attribute__((ext_vector_type(16))); +typedef short v32s __attribute__((ext_vector_type(32))); +typedef double v4d __attribute__((ext_vector_type(4))); + +void builtin_test_unsupported(double a_double, float a_float, + int a_int, long a_long, + v4d a_v4d, + v2s 
a_v2s, v4s a_v4s, v8s a_v8s, + v2i a_v2i, v4i a_v4i, v16i a_v16i, v32i a_v32i, + v2f a_v2f, v4f a_v4f, v16f a_v16f, v32f a_v32f, + v4h a_v4h, v8h a_v8h, + + uint a, uint b) { + __builtin_amdgcn_ds_gws_init(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_init' needs target feature gws}} __builtin_amdgcn_ds_gws_barrier(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_barrier' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_v(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_v' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_br(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_sema_br' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_p(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_p' needs target feature gws}} + + a_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, a_float, a_v32f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, a_float, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs target feature mai-insts}} + a_v4f = __builtin_amdgcn_mfma_f32_4x4x1f32(a_float, a_float, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x1f32' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_32x32x2f32(a_float, a_float, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x2f32' needs target feature mai-insts}} + a_v4f = __builtin_amdgcn_mfma_f32_16x16x4f32(a_float, a_float, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f32' needs target feature mai-insts}} + a_v32f = __builtin_amdgcn_mfma_f32_32x32x4f16(a_v4h, a_v4h, a_v32f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x4f16' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_16x16x4f16(a_v4h, a_v4h, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f16' needs target feature mai-insts}} + a_v4f = 
__builtin_amdgcn_mfma_f32_4x4x4f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x4f16' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_32x32x8f16(a_v4h, a_v4h, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x8f16' needs target feature mai-insts}} + a_v4f = __builtin_amdgcn_mfma_f32_16x16x16f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x16f16' needs target feature mai-insts}} + a_v32i = __builtin_amdgcn_mfma_i32_32x32x4i8(a_int, a_int, a_v32i, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_i32_32x32x4i8' needs
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/78729 >From 19e0554bcebf739f7ad500f64efe62b38781f7a1 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 19 Jan 2024 16:29:46 +0100 Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins __builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac* --- .../builtins-amdgcn-gfx12-err.cl | 86 ++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl index bcaea9a2482d186..f91fea17145102a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl @@ -4,10 +4,94 @@ typedef unsigned int uint; -kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) { +#pragma OPENCL EXTENSION cl_khr_fp64:enable + +typedef float v2f __attribute__((ext_vector_type(2))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef float v16f __attribute__((ext_vector_type(16))); +typedef float v32f __attribute__((ext_vector_type(32))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h __attribute__((ext_vector_type(32))); +typedef intv2i __attribute__((ext_vector_type(2))); +typedef intv4i __attribute__((ext_vector_type(4))); +typedef intv16i __attribute__((ext_vector_type(16))); +typedef intv32i __attribute__((ext_vector_type(32))); +typedef short v2s __attribute__((ext_vector_type(2))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef short v16s __attribute__((ext_vector_type(16))); +typedef short v32s __attribute__((ext_vector_type(32))); +typedef double v4d __attribute__((ext_vector_type(4))); + +void builtin_test_unsupported(double a_double, float a_float, + int a_int, long a_long, + v4d a_v4d, + 
v2s a_v2s, v4s a_v4s, v8s a_v8s, + v2i a_v2i, v4i a_v4i, v16i a_v16i, v32i a_v32i, + v2f a_v2f, v4f a_v4f, v16f a_v16f, v32f a_v32f, + v4h a_v4h, v8h a_v8h, + + uint a, uint b) { + __builtin_amdgcn_ds_gws_init(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_init' needs target feature gws}} __builtin_amdgcn_ds_gws_barrier(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_barrier' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_v(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_v' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_br(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_sema_br' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_p(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_p' needs target feature gws}} + + a_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, a_float, a_v32f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, a_float, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs target feature mai-insts}} + a_v4f = __builtin_amdgcn_mfma_f32_4x4x1f32(a_float, a_float, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x1f32' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_32x32x2f32(a_float, a_float, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x2f32' needs target feature mai-insts}} + a_v4f = __builtin_amdgcn_mfma_f32_16x16x4f32(a_float, a_float, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f32' needs target feature mai-insts}} + a_v32f = __builtin_amdgcn_mfma_f32_32x32x4f16(a_v4h, a_v4h, a_v32f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x4f16' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_16x16x4f16(a_v4h, a_v4h, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f16' needs target feature mai-insts}} + a_v4f = 
__builtin_amdgcn_mfma_f32_4x4x4f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x4f16' needs target feature mai-insts}} + a_v16f = __builtin_amdgcn_mfma_f32_32x32x8f16(a_v4h, a_v4h, a_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x8f16' needs target feature mai-insts}} + a_v4f = __builtin_amdgcn_mfma_f32_16x16x16f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x16f16' needs target feature mai-insts}} + a_v32i = __builtin_amdgcn_mfma_i32_32x32x4i8(a_int, a_int, a_v32i, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_i32_32x32x4i8' needs
[llvm] [clang-tools-extra] [clang] [AMDGPU][GFX12] Default component broadcast store (PR #76212)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/76212 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)
https://github.com/mariusz-sikora-at-amd created https://github.com/llvm/llvm-project/pull/77892 Encoding is VOP3P. Tagged as deep/machine learning instructions. i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1. src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers: f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]). >From 628a3d2b42cdcbd903e0830ab7d631ea7dc422b9 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 10 Jan 2024 12:17:58 +0100 Subject: [PATCH] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions Encoding is VOP3P. Tagged as deep/machine learning instructions. i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1. src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers: f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]). --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 + .../builtins-amdgcn-dl-insts-err.cl | 5 + .../builtins-amdgcn-dl-insts-gfx12.cl | 20 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 ++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 + .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 46 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 17 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 47 llvm/lib/Target/AMDGPU/VOPInstructions.td | 13 +- .../CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll | 255 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s | 120 + llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s | 24 ++ .../MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s | 24 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s| 24 ++ .../test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s | 27 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s | 133 + .../Disassembler/AMDGPU/gfx12_dasm_vop3p.txt | 120 + .../AMDGPU/gfx12_dasm_vop3p_dpp16.txt | 24 ++ .../AMDGPU/gfx12_dasm_vop3p_dpp8.txt | 24 ++ 19 files changed, 938 insertions(+), 12 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s 
create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e562ef04a30194..1c1b9b2c9e9e8c 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -255,6 +255,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", "nc", "dot8-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts") TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts") TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_bf8, "fUiUif", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_fp8, "fUiUif", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_fp8, "fUiUif", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_bf8, "fUiUif", "nc", "gfx12-insts") //===--===// // GFX10+ only builtins. 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl index 6573325150d958..1be47f71276208 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl @@ -49,4 +49,9 @@ kernel void builtins_amdgcn_dl_insts_err( iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);// expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}} iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}} + + fOut[5] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_bf8' needs target feature gfx12-insts}} + fOut[6] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_fp8' needs target feature gfx12-insts}} + fOut[7] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_fp8' needs target feature gfx12-insts}} + fOut[8] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_bf8' needs target feature gfx12-insts}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl new file mode 100644 index 00..31e10c0a5dc18c --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl @@ -0,0 +1,20 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1
[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
@@ -0,0 +1,92 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \ +// RUN: %s -S -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \ +// RUN: -S -o - %s | FileCheck -check-prefix=GFX12 %s + mariusz-sikora-at-amd wrote: Added here: https://github.com/llvm/llvm-project/blob/main/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl https://github.com/llvm/llvm-project/pull/75917 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
@@ -362,24 +358,34 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 ret void } -define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_ret: ; GFX12: ; %bb.0: ; GFX12-NEXT:buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT:v_mov_b32_e32 v1, 0 +; GFX12-NEXT:v_mov_b32_e32 v2, 0 ; GFX12-NEXT:s_waitcnt vmcnt(0) +; GFX12-NEXT:flat_store_b32 v[1:2], v0 +; GFX12-NEXT:v_mov_b32_e32 v0, 1.0 +; GFX12-NEXT:s_waitcnt lgkmcnt(0) ; GFX12-NEXT:; return to shader part epilog ; ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT:buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT:v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT:v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT:s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT:flat_store_b32 v[1:2], v0 +; GFX12-GISEL-NEXT:v_mov_b32_e32 v0, 1.0 +; GFX12-GISEL-NEXT:s_waitcnt lgkmcnt(0) ; GFX12-GISEL-NEXT:; return to shader part epilog - %orig = call <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) - %r = bitcast <2 x i16> %orig to float mariusz-sikora-at-amd wrote: Found an issue in GlobalISel with bitcast and the bfloat type. I will prepare a fix and push it in a separate change. https://github.com/llvm/llvm-project/pull/75917 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
@@ -27,34 +27,23 @@ main_body: ret float %out0 } -define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x i16> %data, i32 %s) { +define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) { ; GFX12-LABEL: atomic_pk_add_bf16_1d_v2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT:image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT:v_mov_b32_e32 v1, 0 +; GFX12-NEXT:v_mov_b32_e32 v2, 0 ; GFX12-NEXT:s_waitcnt vmcnt(0) +; GFX12-NEXT:flat_store_b32 v[1:2], v0 +; GFX12-NEXT:v_mov_b32_e32 v0, 1.0 +; GFX12-NEXT:s_waitcnt lgkmcnt(0) ; GFX12-NEXT:; return to shader part epilog main_body: - %out = call <2 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out_i32 = bitcast <2 x i16> %out to i32 - %out_float = bitcast i32 %out_i32 to float - ret float %out_float -} - -define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x i16> %data, i32 %s) { -; GFX12-LABEL: atomic_pk_add_bf16_1d_v4: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT:image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT:s_waitcnt vmcnt(0) -; GFX12-NEXT:; return to shader part epilog -main_body: - %out = call <4 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) mariusz-sikora-at-amd wrote: Found an issue with <4 x bfloat> and GlobalISel. I will try to debug this and prepare a fix. https://github.com/llvm/llvm-project/pull/75917 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
mariusz-sikora-at-amd wrote: What is the plan for the atomic_{flat/ds/global}_bf16 builtins? Right now they accept <2 x i16> instead of <2 x bfloat>. Do we want to create new builtins, or do we want to override the existing ones to accept both <2 x i16> and <2 x bfloat>? https://github.com/llvm/llvm-project/pull/75917 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
mariusz-sikora-at-amd wrote: ping https://github.com/llvm/llvm-project/pull/76224 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/76224 >From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 21 Dec 2023 11:58:47 +0100 Subject: [PATCH 1/3] [AMDGPU][GFX12] Add Atomic cond_sub_u32 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 7 + llvm/lib/Target/AMDGPU/BUFInstructions.td | 14 + llvm/lib/Target/AMDGPU/DSInstructions.td | 27 +- llvm/lib/Target/AMDGPU/FLATInstructions.td| 31 +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 254 ++ .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 llvm/test/MC/AMDGPU/gfx11_unsupported.s | 12 + llvm/test/MC/AMDGPU/gfx12_asm_ds.s| 18 ++ llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 66 + llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 36 +++ .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 81 ++ .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt | 42 +++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 18 ++ 23 files changed, 812 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index cb48f54b13a6cd..2d066350ee9f84 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,6 +10,10 @@ // //===--===// +def flat_ptr_ty : LLVMQualPointerType<0>; +def global_ptr_ty : LLVMQualPointerType<1>; +def local_ptr_ty : 
LLVMQualPointerType<3>; + class AMDGPUReadPreloadRegisterIntrinsic : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; @@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic; +def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; +def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic; def 
int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -class AMDGPUAtomicRtn : Intrinsic < +class AMDGPUAtomicRtn : Intrinsic < [vt], - [llvm_anyptr_ty,// vaddr - vt], // vdata(VGPR) + [pt, // vaddr + vt], // vdata(VGPR) [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; @@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
[llvm] [clang] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
mariusz-sikora-at-amd wrote: > Missing UniformityAnalysis test for these Done https://github.com/llvm/llvm-project/pull/76224 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/77892 >From 628a3d2b42cdcbd903e0830ab7d631ea7dc422b9 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 10 Jan 2024 12:17:58 +0100 Subject: [PATCH 1/2] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions Encoding is VOP3P. Tagged as deep/machine learning instructions. i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1. src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers: f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]). --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 + .../builtins-amdgcn-dl-insts-err.cl | 5 + .../builtins-amdgcn-dl-insts-gfx12.cl | 20 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 ++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 + .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 46 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 17 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 47 llvm/lib/Target/AMDGPU/VOPInstructions.td | 13 +- .../CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll | 255 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s | 120 + llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s | 24 ++ .../MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s | 24 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s| 24 ++ .../test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s | 27 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s | 133 + .../Disassembler/AMDGPU/gfx12_dasm_vop3p.txt | 120 + .../AMDGPU/gfx12_dasm_vop3p_dpp16.txt | 24 ++ .../AMDGPU/gfx12_dasm_vop3p_dpp8.txt | 24 ++ 19 files changed, 938 insertions(+), 12 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 
e562ef04a30194..1c1b9b2c9e9e8c 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -255,6 +255,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", "nc", "dot8-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts") TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts") TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_bf8, "fUiUif", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_fp8, "fUiUif", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_fp8, "fUiUif", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_bf8, "fUiUif", "nc", "gfx12-insts") //===--===// // GFX10+ only builtins. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl index 6573325150d958..1be47f71276208 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl @@ -49,4 +49,9 @@ kernel void builtins_amdgcn_dl_insts_err( iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);// expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}} iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}} + + fOut[5] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_bf8' needs target feature gfx12-insts}} + fOut[6] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_fp8' needs target feature gfx12-insts}} + fOut[7] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_fp8' needs target feature gfx12-insts}} + fOut[8] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_bf8' needs target feature gfx12-insts}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl new file mode 100644 index 00..31e10c0a5dc18c --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl @@ -0,0 +1,20 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s + +typedef unsigned int uint; + +// CHECK-LABEL: @builtins_amdgcn_dl_insts +// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %uiA, i32 %uiB, float %fC)
[clang] [llvm] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)
@@ -2696,6 +2696,25 @@ def int_amdgcn_udot8 : ImmArg>, ImmArg>, ImmArg>] >; +// f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c +class AMDGPU8bitFloatDot4Intrinsic : + ClangBuiltin, + DefaultAttrsIntrinsic< +[llvm_float_ty], // %r +[ + llvm_i32_ty, // %a + llvm_i32_ty, // %b + llvm_float_ty, // %c +], +[IntrNoMem, IntrSpeculatable] + >; + +def int_amdgcn_fdot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic; +def int_amdgcn_fdot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic; mariusz-sikora-at-amd wrote: Done, renamed fdot4 to dot4 https://github.com/llvm/llvm-project/pull/77892 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
@@ -27,34 +27,23 @@ main_body: ret float %out0 } -define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x i16> %data, i32 %s) { +define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) { ; GFX12-LABEL: atomic_pk_add_bf16_1d_v2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT:image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT:v_mov_b32_e32 v1, 0 +; GFX12-NEXT:v_mov_b32_e32 v2, 0 ; GFX12-NEXT:s_waitcnt vmcnt(0) +; GFX12-NEXT:flat_store_b32 v[1:2], v0 +; GFX12-NEXT:v_mov_b32_e32 v0, 1.0 +; GFX12-NEXT:s_waitcnt lgkmcnt(0) ; GFX12-NEXT:; return to shader part epilog main_body: - %out = call <2 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) - %out_i32 = bitcast <2 x i16> %out to i32 - %out_float = bitcast i32 %out_i32 to float - ret float %out_float -} - -define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x i16> %data, i32 %s) { -; GFX12-LABEL: atomic_pk_add_bf16_1d_v4: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT:image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT:s_waitcnt vmcnt(0) -; GFX12-NEXT:; return to shader part epilog -main_body: - %out = call <4 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) mariusz-sikora-at-amd wrote: Yes, #77448 will fix these issues. https://github.com/llvm/llvm-project/pull/75917 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
@@ -1368,6 +1391,28 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< // gfx908 intrinsic def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic; +// gfx12 intrinsic +def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic < + [llvm_v2i16_ty], mariusz-sikora-at-amd wrote: Done https://github.com/llvm/llvm-project/pull/75917 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [llvm] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/76224 >From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 21 Dec 2023 11:58:47 +0100 Subject: [PATCH 1/4] [AMDGPU][GFX12] Add Atomic cond_sub_u32 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 7 + llvm/lib/Target/AMDGPU/BUFInstructions.td | 14 + llvm/lib/Target/AMDGPU/DSInstructions.td | 27 +- llvm/lib/Target/AMDGPU/FLATInstructions.td| 31 +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 254 ++ .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 llvm/test/MC/AMDGPU/gfx11_unsupported.s | 12 + llvm/test/MC/AMDGPU/gfx12_asm_ds.s| 18 ++ llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 66 + llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 36 +++ .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 81 ++ .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt | 42 +++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 18 ++ 23 files changed, 812 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index cb48f54b13a6cd..2d066350ee9f84 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,6 +10,10 @@ // //===--===// +def flat_ptr_ty : LLVMQualPointerType<0>; +def global_ptr_ty : LLVMQualPointerType<1>; +def local_ptr_ty : 
LLVMQualPointerType<3>; + class AMDGPUReadPreloadRegisterIntrinsic : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; @@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic; +def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; +def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic; def 
int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -class AMDGPUAtomicRtn : Intrinsic < +class AMDGPUAtomicRtn : Intrinsic < [vt], - [llvm_anyptr_ty,// vaddr - vt], // vdata(VGPR) + [pt, // vaddr + vt], // vdata(VGPR) [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; @@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
[clang] [clang-tools-extra] [llvm] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
@@ -2502,10 +2500,9 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn; -def int_amdgcn_flat_atomic_cond_sub_u32 : AMDGPUAtomicRtn; -def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn; - -def int_amdgcn_ds_cond_sub_u32 : AMDGPUAtomicRtn; +def int_amdgcn_flat_atomic_cond_sub_u32 : AMDGPUAtomicRtn; +def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn; +def int_amdgcn_ds_cond_sub_u32: AMDGPUAtomicRtn; mariusz-sikora-at-amd wrote: done https://github.com/llvm/llvm-project/pull/76224 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [clang] [llvm] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/76224 >From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 21 Dec 2023 11:58:47 +0100 Subject: [PATCH 1/5] [AMDGPU][GFX12] Add Atomic cond_sub_u32 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 7 + llvm/lib/Target/AMDGPU/BUFInstructions.td | 14 + llvm/lib/Target/AMDGPU/DSInstructions.td | 27 +- llvm/lib/Target/AMDGPU/FLATInstructions.td| 31 +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 254 ++ .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 llvm/test/MC/AMDGPU/gfx11_unsupported.s | 12 + llvm/test/MC/AMDGPU/gfx12_asm_ds.s| 18 ++ llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 66 + llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 36 +++ .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 81 ++ .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt | 42 +++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 18 ++ 23 files changed, 812 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index cb48f54b13a6cda..2d066350ee9f84e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,6 +10,10 @@ // //===--===// +def flat_ptr_ty : LLVMQualPointerType<0>; +def global_ptr_ty : LLVMQualPointerType<1>; +def local_ptr_ty : 
LLVMQualPointerType<3>; + class AMDGPUReadPreloadRegisterIntrinsic : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; @@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic; +def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; +def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic; def 
int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -class AMDGPUAtomicRtn : Intrinsic < +class AMDGPUAtomicRtn : Intrinsic < [vt], - [llvm_anyptr_ty,// vaddr - vt], // vdata(VGPR) + [pt, // vaddr + vt], // vdata(VGPR) [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; @@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
@@ -1182,6 +1182,11 @@ The AMDGPU backend implements the following LLVM IR intrinsics. The iglp_opt strategy implementations are subject to change. + llvm.atomic.cond.sub.u32 Provides direct access to flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32 mariusz-sikora-at-amd wrote: ah, thanks ! https://github.com/llvm/llvm-project/pull/76224 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/76224 >From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Thu, 21 Dec 2023 11:58:47 +0100 Subject: [PATCH 1/6] [AMDGPU][GFX12] Add Atomic cond_sub_u32 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 7 + llvm/lib/Target/AMDGPU/BUFInstructions.td | 14 + llvm/lib/Target/AMDGPU/DSInstructions.td | 27 +- llvm/lib/Target/AMDGPU/FLATInstructions.td| 31 +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 254 ++ .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 llvm/test/MC/AMDGPU/gfx11_unsupported.s | 12 + llvm/test/MC/AMDGPU/gfx12_asm_ds.s| 18 ++ llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 66 + llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 36 +++ .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 81 ++ .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt | 42 +++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 18 ++ 23 files changed, 812 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index cb48f54b13a6cd..2d066350ee9f84 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,6 +10,10 @@ // //===--===// +def flat_ptr_ty : LLVMQualPointerType<0>; +def global_ptr_ty : LLVMQualPointerType<1>; +def local_ptr_ty : 
LLVMQualPointerType<3>; + class AMDGPUReadPreloadRegisterIntrinsic : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; @@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic; +def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; +def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic; def 
int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -class AMDGPUAtomicRtn : Intrinsic < +class AMDGPUAtomicRtn : Intrinsic < [vt], - [llvm_anyptr_ty,// vaddr - vt], // vdata(VGPR) + [pt, // vaddr + vt], // vdata(VGPR) [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; @@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
[llvm] [clang] [AMDGPU][NFC] Rename feature FP8Insts to FP8ConversionInsts (PR #78439)
https://github.com/mariusz-sikora-at-amd created https://github.com/llvm/llvm-project/pull/78439 None >From 5bd1644ec60996fed50c843e13e68f7c2c6dda81 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Wed, 17 Jan 2024 13:19:55 +0100 Subject: [PATCH] [AMDGPU][NFC] Rename feature FP8Insts to FP8ConversionInsts --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 16 clang/test/CodeGenOpenCL/amdgpu-features.cl | 6 +++--- llvm/lib/Target/AMDGPU/AMDGPU.td | 10 ++ llvm/lib/Target/AMDGPU/GCNSubtarget.h| 5 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 ++-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 4 ++-- llvm/lib/TargetParser/TargetParser.cpp | 1 + 7 files changed, 31 insertions(+), 15 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e562ef04a30194e..f02b4d321328fe2 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -397,14 +397,14 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8, "V16fV2iV4iV16fiIiI TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts") TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-conversion-insts") 
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-insts") //===--===// // GFX12+ only builtins. diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 8959634572b44e9..df58cd7b62006da 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -80,9 +80,9 @@ // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX940: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX941: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX940: "target-fea
[clang] [llvm] [AMDGPU][NFC] Rename feature FP8Insts to FP8ConversionInsts (PR #78439)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/78439 >From 5bd1644ec60996fed50c843e13e68f7c2c6dda81 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Wed, 17 Jan 2024 13:19:55 +0100 Subject: [PATCH 1/2] [AMDGPU][NFC] Rename feature FP8Insts to FP8ConversionInsts --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 16 clang/test/CodeGenOpenCL/amdgpu-features.cl | 6 +++--- llvm/lib/Target/AMDGPU/AMDGPU.td | 10 ++ llvm/lib/Target/AMDGPU/GCNSubtarget.h| 5 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 ++-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 4 ++-- llvm/lib/TargetParser/TargetParser.cpp | 1 + 7 files changed, 31 insertions(+), 15 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e562ef04a30194e..f02b4d321328fe2 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -397,14 +397,14 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8, "V16fV2iV4iV16fiIiI TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts") TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts") -TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-conversion-insts") 
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-conversion-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-insts") //===--===// // GFX12+ only builtins. diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 8959634572b44e9..df58cd7b62006da 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -80,9 +80,9 @@ // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX940: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX941: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX940: "target-feat
[llvm] [clang-tools-extra] [clang] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/76224 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][NFC] Rename feature FP8Insts to FP8ConversionInsts (PR #78439)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/78439 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[lldb] [clang] [compiler-rt] [flang] [lld] [llvm] [libcxx] [libc] [clang-tools-extra] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/77892 >From 628a3d2b42cdcbd903e0830ab7d631ea7dc422b9 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 10 Jan 2024 12:17:58 +0100 Subject: [PATCH 1/2] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions Encoding is VOP3P. Tagged as deep/machine learning instructions. i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1. src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers: f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]). --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 + .../builtins-amdgcn-dl-insts-err.cl | 5 + .../builtins-amdgcn-dl-insts-gfx12.cl | 20 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 ++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 + .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 46 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 17 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 47 llvm/lib/Target/AMDGPU/VOPInstructions.td | 13 +- .../CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll | 255 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s | 120 + llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s | 24 ++ .../MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s | 24 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s| 24 ++ .../test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s | 27 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s | 133 + .../Disassembler/AMDGPU/gfx12_dasm_vop3p.txt | 120 + .../AMDGPU/gfx12_dasm_vop3p_dpp16.txt | 24 ++ .../AMDGPU/gfx12_dasm_vop3p_dpp8.txt | 24 ++ 19 files changed, 938 insertions(+), 12 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 
e562ef04a30194..1c1b9b2c9e9e8c 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -255,6 +255,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", "nc", "dot8-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts") TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts") TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_bf8, "fUiUif", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_fp8, "fUiUif", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_fp8, "fUiUif", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_bf8, "fUiUif", "nc", "gfx12-insts") //===--===// // GFX10+ only builtins. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl index 6573325150d958..1be47f71276208 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl @@ -49,4 +49,9 @@ kernel void builtins_amdgcn_dl_insts_err( iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);// expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}} iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}} + + fOut[5] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_bf8' needs target feature gfx12-insts}} + fOut[6] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_fp8' needs target feature gfx12-insts}} + fOut[7] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);// expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_fp8' needs target feature gfx12-insts}} + fOut[8] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_bf8' needs target feature gfx12-insts}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl new file mode 100644 index 00..31e10c0a5dc18c --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl @@ -0,0 +1,20 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s + +typedef unsigned int uint; + +// CHECK-LABEL: @builtins_amdgcn_dl_insts +// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %uiA, i32 %uiB, float %fC)
[lldb] [clang] [compiler-rt] [flang] [lld] [llvm] [libcxx] [libc] [clang-tools-extra] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)
mariusz-sikora-at-amd wrote: Rebase to run tests https://github.com/llvm/llvm-project/pull/77892 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libcxx] [llvm] [clang-tools-extra] [libc] [clang] [flang] [compiler-rt] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
@@ -1,56 +1,244 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -march=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s mariusz-sikora-at-amd wrote: Done https://github.com/llvm/llvm-project/pull/75917 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [libc] [lld] [compiler-rt] [libcxx] [clang-tools-extra] [flang] [lldb] [llvm] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/77892 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [libc] [libcxx] [clang-tools-extra] [flang] [compiler-rt] [llvm] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/75917 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
https://github.com/mariusz-sikora-at-amd created https://github.com/llvm/llvm-project/pull/78729 __builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac* >From d5a823584487d9f6b3e9bebc8976c7891243f470 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 19 Jan 2024 16:29:46 +0100 Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins __builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac* --- ...ltins-amdgcn-error-unsupported-on-gfx12.cl | 105 ++ 1 file changed, 105 insertions(+) create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-unsupported-on-gfx12.cl diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-unsupported-on-gfx12.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-unsupported-on-gfx12.cl new file mode 100644 index 00..3e290f76017ffa --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-unsupported-on-gfx12.cl @@ -0,0 +1,105 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1200 -verify -S -o - %s + +#pragma OPENCL EXTENSION cl_khr_fp64:enable + +typedef float v2f __attribute__((ext_vector_type(2))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef float v16f __attribute__((ext_vector_type(16))); +typedef float v32f __attribute__((ext_vector_type(32))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h __attribute__((ext_vector_type(32))); +typedef intv2i __attribute__((ext_vector_type(2))); +typedef intv4i __attribute__((ext_vector_type(4))); +typedef intv16i __attribute__((ext_vector_type(16))); +typedef intv32i __attribute__((ext_vector_type(32))); +typedef short v2s __attribute__((ext_vector_type(2))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef short v16s __attribute__((ext_vector_type(16))); +typedef short v32s __attribute__((ext_vector_type(32))); 
+typedef double v4d __attribute__((ext_vector_type(4))); + +void test(global v32f*out_v32f, + global v16f*out_v16f, + global v4f* out_v4f, + global v32i*out_v32i, + global v16i*out_v16i, + global v4i* out_v4i, + global v4d* out_v4d, + global double* out_double, + double a_double , double b_double , double c_double, + float a_float , float b_float , float c_float, + int a_int , intb_int, intc_int, + long a_long, long b_long , long c_long, + v4d a_v4d , v4db_v4d, v4dc_v4d, + v8s a_v8s , v8sb_v8s, v8sc_v8s, + v4s a_v4s , v4sb_v4s, v4sc_v4s, + v2s a_v2s , v2sb_v2s, v2sc_v2s, + v2i a_v2i , v2ib_v2i, v2ic_v2i, + v16i a_v16i, v16i b_v16i , v16i c_v16i, + v32i a_v32i, v32i b_v32i , v32i c_v32i, + v4i a_v4i , v4ib_v4i, v4ic_v4i, + v2f a_v2f , v2fb_v2f, v2fc_v2f, + v4f a_v4f , v4fb_v4f, v4fc_v4f, + v16f a_v16f, v16f b_v16f , v16f c_v16f, + v32f a_v32f, v32f b_v32f , v32f c_v32f, + v4h a_v4h , v4hb_v4h, v4hc_v4h, + v8h a_v8h , v8hb_v8h, v8hc_v8h, + int idx) { + *out_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, b_float, c_v32f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs target feature mai-insts}} + *out_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, b_float, c_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs target feature mai-insts}} + *out_v4f = __builtin_amdgcn_mfma_f32_4x4x1f32(a_float, b_float, c_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x1f32' needs target feature mai-insts}} + *out_v16f = __builtin_amdgcn_mfma_f32_32x32x2f32(a_float, b_float, c_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x2f32' needs target feature mai-insts}} + *out_v4f = __builtin_amdgcn_mfma_f32_16x16x4f32(a_float, b_float, c_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f32' needs target feature mai-insts}} + *out_v32f = __builtin_amdgcn_mfma_f32_32x32x4f16(a_v4h, b_v4h, c_v32f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x4f16' needs 
target feature mai-insts}} + *out_v16f = __builtin_amdgcn_mfma_f32_16x16x4f16(a_v4h, b_v4h, c_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f16' needs target feature mai-insts}} + *out_v4f = __builtin_amdgcn_mfma_f32_4x4x4f16(a_v4h, b_v4h, c_v4f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x4f16' needs target feature mai-insts}} + *
[clang-tools-extra] [compiler-rt] [lldb] [clang] [libcxx] [llvm] [flang] [lld] [libc] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)
mariusz-sikora-at-amd wrote: > Can you add a GFX12 RUN line to > clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl? That will probably require > adding "fp8-conversion-insts" to the GFX12 part of TargetParser.cpp. You can > do this in a separate patch if you want. Done https://github.com/llvm/llvm-project/pull/78414 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libc] [flang] [compiler-rt] [llvm] [clang-tools-extra] [lldb] [clang] [libcxx] [lld] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)
mariusz-sikora-at-amd wrote: > Why is so there so much special casing in the assembler/disassembler? I'm not the original author of this change, but from what I understand it is a workaround to handle VOP3 instructions which have a single source but require the use of two bits from OPSEL. `V_CVT_F32_FP8` has one source but uses two bits from OPSEL to specify which part of the 32-bit register to convert ([7:0], [15:8], [23:16] or [31:24]). Since OPSEL bits are correlated with sources/destination (one OPSEL bit per source/destination), this workaround is required unless deeper changes are made to TableGen. I'm open to changing TableGen, but I would prefer to create a new ticket and do it in a new PR. That change may take longer than one day, and we would like to have this PR merged before LLVM branching. https://github.com/llvm/llvm-project/pull/78414 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/78729 >From eb04956ce8ad84206a95789885003dd6c6f60d2e Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 19 Jan 2024 16:29:46 +0100 Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins __builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac* --- .../builtins-amdgcn-gfx12-err.cl | 106 +- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl index bcaea9a2482d186..413212909701c19 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl @@ -4,10 +4,114 @@ typedef unsigned int uint; -kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) { +#pragma OPENCL EXTENSION cl_khr_fp64:enable + +typedef float v2f __attribute__((ext_vector_type(2))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef float v16f __attribute__((ext_vector_type(16))); +typedef float v32f __attribute__((ext_vector_type(32))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h __attribute__((ext_vector_type(32))); +typedef intv2i __attribute__((ext_vector_type(2))); +typedef intv4i __attribute__((ext_vector_type(4))); +typedef intv16i __attribute__((ext_vector_type(16))); +typedef intv32i __attribute__((ext_vector_type(32))); +typedef short v2s __attribute__((ext_vector_type(2))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef short v16s __attribute__((ext_vector_type(16))); +typedef short v32s __attribute__((ext_vector_type(32))); +typedef double v4d __attribute__((ext_vector_type(4))); + +void builtin_test_unsupported(global v32f*out_v32f, + global v16f*out_v16f, + global v4f* out_v4f, + 
global v32i*out_v32i, + global v16i*out_v16i, + global v4i* out_v4i, + global v4d* out_v4d, + global double* out_double, + double a_double , double b_double , double c_double, + float a_float , float b_float , float c_float, + int a_int , intb_int, intc_int, + long a_long, long b_long , long c_long, + v4d a_v4d , v4db_v4d, v4dc_v4d, + v8s a_v8s , v8sb_v8s, v8sc_v8s, + v4s a_v4s , v4sb_v4s, v4sc_v4s, + v2s a_v2s , v2sb_v2s, v2sc_v2s, + v2i a_v2i , v2ib_v2i, v2ic_v2i, + v16i a_v16i, v16i b_v16i , v16i c_v16i, + v32i a_v32i, v32i b_v32i , v32i c_v32i, + v4i a_v4i , v4ib_v4i, v4ic_v4i, + v2f a_v2f , v2fb_v2f, v2fc_v2f, + v4f a_v4f , v4fb_v4f, v4fc_v4f, + v16f a_v16f, v16f b_v16f , v16f c_v16f, + v32f a_v32f, v32f b_v32f , v32f c_v32f, + v4h a_v4h , v4hb_v4h, v4hc_v4h, + v8h a_v8h , v8hb_v8h, v8hc_v8h, + int idx, + + uint a, uint b) { + __builtin_amdgcn_ds_gws_init(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_init' needs target feature gws}} __builtin_amdgcn_ds_gws_barrier(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_barrier' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_v(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_v' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_br(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_sema_br' needs target feature gws}} __builtin_amdgcn_ds_gws_sema_p(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_p' needs target feature gws}} + + *out_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, b_float, c_v32f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs target feature mai-insts}} + *out_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, b_float, c_v16f, 0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs target feature mai-insts}} +
[clang] [lldb] [flang] [lld] [libc] [libcxx] [compiler-rt] [llvm] [clang-tools-extra] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)
@@ -626,11 +629,82 @@ class Cvt_PK_F32_F8_Pat; -foreach Index = [0, -1] in { - def : Cvt_PK_F32_F8_Pat; - def : Cvt_PK_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { + foreach Index = [0, -1] in { +def : Cvt_PK_F32_F8_Pat; +def : Cvt_PK_F32_F8_Pat; + } +} + + +// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions. +def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F { + let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, + clampmod:$clamp, omod:$omod, op_sel0:$op_sel); + + let HasOpSel = 1; + let HasExtVOP3DPP = 0; +} + +def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, i32, untyped]> { + let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, + Src1Mod:$src1_modifiers, Src1RC64:$src1, + clampmod:$clamp, omod:$omod, op_sel0:$op_sel); + let AsmVOP3OpSel = !subst(", $src1_modifiers", "", getAsmVOP3OpSel<2, 0, 0, 1, 1, 0>.ret); + + let HasOpSel = 1; + let HasExtDPP = 1; + let HasExtVOP3DPP = 1; + + let Src1VOP3DPP = Src1RC64; + let AsmVOP3DPP8 = getAsmVOP3DPP8.ret; + let AsmVOP3DPP16 = getAsmVOP3DPP16.ret; +} + +let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0, +SchedRW = [WriteFloatCvt] in { + defm V_CVT_F32_FP8_OP_SEL: VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_F32_BF8_OP_SEL: VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; + defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; +} + +class Cvt_F32_F8_Pat_OpSel index, +VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat< +(f32 (node i32:$src, index)), +!if (index, + (inst_e64 !if(index{0}, SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), $src, + !if(index{1}, SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), (i32 0), mariusz-sikora-at-amd wrote: I removed SRCMODS.OP_SEL_1 from the pattern https://github.com/llvm/llvm-project/pull/78414 ___ cfe-commits 
mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [lldb] [flang] [lld] [libc] [libcxx] [compiler-rt] [llvm] [clang-tools-extra] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)
https://github.com/mariusz-sikora-at-amd edited https://github.com/llvm/llvm-project/pull/78414 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [lld] [clang-tools-extra] [compiler-rt] [llvm] [libcxx] [lldb] [libc] [clang] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)
mariusz-sikora-at-amd wrote: > > Correct, some of these instructions use opsel[1] which in LLVM in stored in > > src1_modifiers so a dummy src1 is used. > > Why can't we just use `SRCMODS.OP_SEL_1` with src0? When referring to `SRCMODS.OP_SEL_1`, are you referring to `src1_modifiers` (the second bit in `OPSEL`) or to `OPSEL_HI`? https://github.com/llvm/llvm-project/pull/78414 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] ea064ee - [AMDGPU] Create Subtarget Features for some of 16 bits atomic fadd instructions
Author: Mariusz Sikora Date: 2023-03-24T13:10:40+01:00 New Revision: ea064ee2a3bd22f5598d0eb76a1bbc3bf293b063 URL: https://github.com/llvm/llvm-project/commit/ea064ee2a3bd22f5598d0eb76a1bbc3bf293b063 DIFF: https://github.com/llvm/llvm-project/commit/ea064ee2a3bd22f5598d0eb76a1bbc3bf293b063.diff LOG: [AMDGPU] Create Subtarget Features for some of 16 bits atomic fadd instructions Introducing Subtarget Features for instructions: - ds_pk_add_bf16 - ds_pk_add_f16 - ds_pk_add_rtn_bf16 - ds_pk_add_rtn_f16 - flat_atomic_pk_add_f16 - flat_atomic_pk_add_bf16 - global_atomic_pk_add_f16 - global_atomic_pk_add_bf16 - buffer_atomic_pk_add_f16 Differential Revision: https://reviews.llvm.org/D146701 Added: clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl Modified: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/lib/Basic/Targets/AMDGPU.cpp clang/test/CodeGenOpenCL/amdgpu-features.cl clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl llvm/lib/Target/AMDGPU/AMDGPU.td llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp llvm/lib/Target/AMDGPU/BUFInstructions.td llvm/lib/Target/AMDGPU/DSInstructions.td llvm/lib/Target/AMDGPU/FLATInstructions.td llvm/lib/Target/AMDGPU/GCNSubtarget.h Removed: diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 8e7449d426bff..ed75b58ddbf96 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -215,7 +215,7 @@ TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "", "nc", "gfx9-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1d", "t", "gfx90a-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", "gfx90a-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", "gfx90a-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", 
"atomic-buffer-global-pk-add-f16-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1d", "t", "gfx90a-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmax_f64, "dd*1d", "t", "gfx90a-insts") @@ -227,10 +227,10 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3d", "t", "gfx90a-insts" TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts") TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f32, "ff*0f", "t", "gfx940-insts") -TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", "gfx940-insts") -TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "gfx940-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "gfx940-insts") -TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "gfx940-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", "atomic-flat-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "atomic-flat-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") +TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") //===--===// // Deep learning builtins. 
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 8dd27670d1c18..72dfb07804dff 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -257,9 +257,13 @@ bool AMDGPUTargetInfo::initFeatureMap( case GK_GFX940: Features["gfx940-insts"] = true; Features["fp8-insts"] = true; + Features["atomic-ds-pk-add-16-insts"] = true; + Features["atomic-flat-pk-add-16-insts"] = true; + Features["atomic-global-pk-add-bf16-inst"] = true; [[fallthrough]]; case GK_GFX90A: Features["gfx90a-insts"] = true; + Features["atomic-buffer-global-pk-add-f16-insts"] = true; [[fallthrough]]; case GK_GFX908: Features["dot3-insts"] = true; diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 9e24290668d92..4a4da6b270b9a 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -72,9 +72,9 @@ // GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+
[clang] 69061f9 - [AMDGPU] Add clang builtin for __builtin_amdgcn_ds_atomic_fadd_v2f16
Author: Mariusz Sikora Date: 2023-03-24T16:27:44+01:00 New Revision: 69061f96275c3053623a8699ce641c0f0ac61aed URL: https://github.com/llvm/llvm-project/commit/69061f96275c3053623a8699ce641c0f0ac61aed DIFF: https://github.com/llvm/llvm-project/commit/69061f96275c3053623a8699ce641c0f0ac61aed.diff LOG: [AMDGPU] Add clang builtin for __builtin_amdgcn_ds_atomic_fadd_v2f16 Differential Revision: https://reviews.llvm.org/D146808 Added: Modified: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl Removed: diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index ed75b58ddbf96..965bd97a97d79 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -231,6 +231,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", "ato TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "atomic-flat-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") //===--===// // Deep learning builtins. 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index b3aea13878c1c..c8112b0ea0ec0 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17213,7 +17213,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {Addr, Val}); } case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64: - case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: { + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: + case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: { Intrinsic::ID IID; llvm::Type *ArgTy; switch (BuiltinID) { @@ -17225,6 +17226,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, ArgTy = llvm::Type::getDoubleTy(getLLVMContext()); IID = Intrinsic::amdgcn_ds_fadd; break; +case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getHalfTy(getLLVMContext()), 2); + IID = Intrinsic::amdgcn_ds_fadd; + break; } llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); llvm::Value *Val = EmitScalarExpr(E->getArg(1)); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl index 3044fdedca36b..39191322ca6e4 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl @@ -15,4 +15,5 @@ void test_atomic_fadd(__global half2 *addrh2, __local half2 *addrh2l, half2 xh2, __builtin_amdgcn_global_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2bf16' needs target feature atomic-global-pk-add-bf16-inst}} __builtin_amdgcn_global_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2f16' needs target feature atomic-buffer-global-pk-add-f16-insts}} __builtin_amdgcn_ds_atomic_fadd_v2bf16(addrs2l, xs2); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2bf16' needs target feature 
atomic-ds-pk-add-16-insts}} + __builtin_amdgcn_ds_atomic_fadd_v2f16(addrh2l, xh2); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2f16' needs target feature atomic-ds-pk-add-16-insts}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl index fd813ac029eab..0548b825a7265 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl @@ -4,9 +4,9 @@ typedef half __attribute__((ext_vector_type(2))) half2; -void test_global_add_2f16(__global half2 *addrh2, half2 xh2, - __global float *addrf, float xf, - __global double *addr, double x) { +void test_global_fadd(__global half2 *addrh2, __local half2 *addrh2l, half2 xh2, + __global float *addrf, float xf, + __global double *addr, double x) { half2 *half_rtn; float *fp_rtn; double *rtn; @@ -18,4 +18,5 @@ void test_gl
[clang] [llvm] AMDGPU: Define v_mfma_f32_{16x16x128|32x32x64}_f8f6f4 instructions (PR #116723)
@@ -15454,6 +15454,23 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MRI.setRegClass(Op.getReg(), NewRC); } + if (TII->isMAI(MI)) { +// The ordinary src0, src1, src2 were legalized above. +// +// We have to also legalize the appended v_mfma_ld_scale_b32 operands, +// as a separate instruction. +int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::scale_src0); +if (Src0Idx != -1) { + int Src1Idx = Src0Idx + 2; + assert(Src1Idx = AMDGPU::getNamedOperandIdx( mariusz-sikora-at-amd wrote: == ? https://github.com/llvm/llvm-project/pull/116723 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU] Run DL builtin tests for new GFX (PR #130054)
https://github.com/mariusz-sikora-at-amd closed https://github.com/llvm/llvm-project/pull/130054 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits