[libcxxabi] [flang] [clang-tools-extra] [libcxx] [lldb] [clang] [llvm] [compiler-rt] [lld] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-07 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/74576

>From 23759746b66c33028ad2340b1e98067ebf1f8074 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Tue, 28 Jun 2022 15:24:24 -0700
Subject: [PATCH] [AMDGPU] GFX12: select @llvm.prefetch intrinsic

---
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  21 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   2 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  22 +
 llvm/lib/Target/AMDGPU/SIISelLowering.h   |   2 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|   2 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  12 +
 llvm/lib/Target/AMDGPU/SMInstructions.td  |  34 ++
 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 496 ++
 8 files changed, 591 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79..f0b3ed7adc294 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
   applyDefaultMapping(OpdMapper);
   constrainOpWithReadfirstlane(B, MI, 8); // M0
   return;
+case Intrinsic::prefetch: {
+  if (!Subtarget.hasPrefetch()) {
+MI.eraseFromParent();
+return;
+  }
+  unsigned PtrBank =
+  getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
+  if (PtrBank == AMDGPU::VGPRRegBankID) {
+MI.eraseFromParent();
+return;
+  }
+  // FIXME: There is currently no support for prefetch in global isel.
+  // There is no node equivalence and what's worse there is no MMO produced
+  // for a prefetch on global isel path.
+  // Prefetch does not affect execution so erase it for now.
+  MI.eraseFromParent();
+  return;
+}
 default: {
   if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
   AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const 
MachineInstr &MI) const {
   getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
   break;
 }
+case Intrinsic::prefetch:
+  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+  break;
 
 default:
   return getInvalidInstructionMapping();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6..21a9b8147034f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
 
+  bool hasPrefetch() const { return GFX12Insts; }
+
   // Scratch is allocated in 256 dword per wave blocks for the entire
   // wavefront. When viewed from the perspective of an arbitrary workitem, this
   // is 4-byte aligned.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7e..93af38d877c5d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasMad64_32())
 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
 
+  if (Subtarget->hasPrefetch())
+setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
  {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
   MVT::v2i16, MVT::v2f16, MVT::i128},
@@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
 }
 
+SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
+  if (Op->isDivergent())
+return SDValue();
+
+  switch (cast<MemSDNode>(Op)->getAddressSpace()) {
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+break;
+  default:
+return SDValue();
+  }
+
+  return Op;
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
  const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
@@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG &DAG) const {
 return LowerSTACKSAVE(Op, DAG);
   case ISD::GET_ROUNDING:
 return lowerGET_ROUNDING(Op, DAG);
+  case ISD::PREFETCH:
+return lowerPREFETCH(Op, DAG);
   }
   return SDValue();
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h 
b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c9cc149218a99..5bc091d6e84de 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h

[libcxxabi] [flang] [lld] [llvm] [compiler-rt] [lldb] [clang] [clang-tools-extra] [libcxx] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-07 Thread Mariusz Sikora via cfe-commits


@@ -959,6 +967,32 @@ def : GCNPat <
 }
 } // let OtherPredicates = [HasShaderCyclesRegister]
 
+def SIMM24bitPtr : ImmLeaf <iPTR,
+  [{return isInt<24>(Imm);}]
+>;
+
+multiclass SMPrefetchPat {
+  def : GCNPat <
+(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 
cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 
SGPR_NULL), (i8 0))
+  >;
+
+  def : GCNPat <
+(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), 
(i8 0))
+  >;
+
+  def : GCNPat <
+(prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type#"_PC_REL") (as_i32timm 
$offset), (i32 SGPR_NULL), (i8 0))
+  > {
+let AddedComplexity = 10;
+  }

mariusz-sikora-at-amd wrote:

Maybe for now I will remove PC_REL part.

https://github.com/llvm/llvm-project/pull/74576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[libcxx] [flang] [clang-tools-extra] [libcxxabi] [compiler-rt] [clang] [lldb] [lld] [llvm] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-07 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/74576

>From 23759746b66c33028ad2340b1e98067ebf1f8074 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Tue, 28 Jun 2022 15:24:24 -0700
Subject: [PATCH 1/2] [AMDGPU] GFX12: select @llvm.prefetch intrinsic

---
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  21 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   2 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  22 +
 llvm/lib/Target/AMDGPU/SIISelLowering.h   |   2 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|   2 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  12 +
 llvm/lib/Target/AMDGPU/SMInstructions.td  |  34 ++
 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 496 ++
 8 files changed, 591 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79..f0b3ed7adc294 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
   applyDefaultMapping(OpdMapper);
   constrainOpWithReadfirstlane(B, MI, 8); // M0
   return;
+case Intrinsic::prefetch: {
+  if (!Subtarget.hasPrefetch()) {
+MI.eraseFromParent();
+return;
+  }
+  unsigned PtrBank =
+  getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
+  if (PtrBank == AMDGPU::VGPRRegBankID) {
+MI.eraseFromParent();
+return;
+  }
+  // FIXME: There is currently no support for prefetch in global isel.
+  // There is no node equivalence and what's worse there is no MMO produced
+  // for a prefetch on global isel path.
+  // Prefetch does not affect execution so erase it for now.
+  MI.eraseFromParent();
+  return;
+}
 default: {
   if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
   AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const 
MachineInstr &MI) const {
   getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
   break;
 }
+case Intrinsic::prefetch:
+  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+  break;
 
 default:
   return getInvalidInstructionMapping();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6..21a9b8147034f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
 
+  bool hasPrefetch() const { return GFX12Insts; }
+
   // Scratch is allocated in 256 dword per wave blocks for the entire
   // wavefront. When viewed from the perspective of an arbitrary workitem, this
   // is 4-byte aligned.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7e..93af38d877c5d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasMad64_32())
 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
 
+  if (Subtarget->hasPrefetch())
+setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
  {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
   MVT::v2i16, MVT::v2f16, MVT::i128},
@@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
 }
 
+SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
+  if (Op->isDivergent())
+return SDValue();
+
+  switch (cast<MemSDNode>(Op)->getAddressSpace()) {
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+break;
+  default:
+return SDValue();
+  }
+
+  return Op;
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
  const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
@@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG &DAG) const {
 return LowerSTACKSAVE(Op, DAG);
   case ISD::GET_ROUNDING:
 return lowerGET_ROUNDING(Op, DAG);
+  case ISD::PREFETCH:
+return lowerPREFETCH(Op, DAG);
   }
   return SDValue();
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h 
b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c9cc149218a99..5bc091d6e84de 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowerin

[clang] [llvm] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)

2023-12-12 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

ping

https://github.com/llvm/llvm-project/pull/74836
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[compiler-rt] [clang-tools-extra] [libcxx] [llvm] [flang] [clang] [libc] [AMDGPU] Update VOP instructions for GFX12 (PR #74853)

2023-12-12 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/74853
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [mlir] [flang] [lld] [libc] [llvm] [compiler-rt] [libcxx] [lldb] [clang] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)

2023-12-13 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/74836
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU][GFX12] Add new v_permlane16 variants (PR #75475)

2023-12-14 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd created 
https://github.com/llvm/llvm-project/pull/75475

None

>From c878aa8f2e331cf8c88ab6e191db663ed56d9ce7 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Thu, 14 Dec 2023 11:13:36 +0100
Subject: [PATCH] [AMDGPU][GFX12] Add new v_permlane16 variants

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   2 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl|  48 +
 .../builtins-amdgcn-error-gfx12-param.cl  |  14 +
 .../SemaOpenCL/builtins-amdgcn-error-gfx12.cl |  16 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  18 +
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp |  19 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 +
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |   4 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp|   4 +-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  30 +
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |  16 +
 .../AMDGPU/llvm.amdgcn.permlane16.var.ll  | 896 ++
 .../CodeGen/AMDGPU/permlane16_var-op-sel.ll   |  15 +
 .../AMDGPU/vcmpx-permlane16var-hazard.mir | 168 
 llvm/test/MC/AMDGPU/gfx11_unsupported.s   |   6 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop3.s  |  51 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s  |  95 ++
 .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt   |  51 +
 19 files changed, 1459 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12-param.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/permlane16_var-op-sel.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vcmpx-permlane16var-hazard.mir

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 7465f13d552d6e..e562ef04a30194 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -410,6 +410,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", 
"nc", "fp8-insts")
 // GFX12+ only builtins.
 
//===----------------------------------------------------------------------===//
 
+TARGET_BUILTIN(__builtin_amdgcn_permlane16_var,  "UiUiUiUiIbIb", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_permlanex16_var, "UiUiUiUiIbIb", "nc", 
"gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index b8d281531e218e..2899d9e5c28898 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -1,6 +1,54 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown 
-target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
 
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int uint;
+
+// CHECK-LABEL: @test_permlane16_var(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) 
[[OUT_ADDR]], align 8
+// CHECK-NEXT:store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
+// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:[[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:[[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
+// CHECK-NEXT:[[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane16.var(i32 
[[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false)
+// CHECK-NEXT:[[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) 
[[OUT_ADDR]], align 8
+// CHECK-NEXT:store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT:ret void
+//
+void test_permlane16_var(global uint* out, uint a, uint b, uint c) {
+  *out = __builtin_amdgcn_permlane16_var(a, b, c, 0, 0);
+}
+
+// CHECK-LABEL: @test_permlanex16_var(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5

[llvm] [libcxx] [clang] [clang-tools-extra] [lldb] [libc] [flang] [mlir] [compiler-rt] [lld] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)

2023-12-14 Thread Mariusz Sikora via cfe-commits


@@ -684,6 +684,51 @@ s_rndne_f16 s5, 0xfe0b
 s_rndne_f16 s5, 0x3456
 // GFX12: encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00]
 
+s_barrier_signal -2

mariusz-sikora-at-amd wrote:

Thanks !

https://github.com/llvm/llvm-project/pull/74836
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU][GFX12] Add new v_permlane16 variants (PR #75475)

2023-12-15 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

> LGTM
> 
> You could also update existing permlane tests with run lines for gfx12:
> 
> * test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
> 
> * test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
> 
> 
> This can also be a separate patch.

New patch: https://github.com/llvm/llvm-project/pull/75572

https://github.com/llvm/llvm-project/pull/75475
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU][GFX12] Add new v_permlane16 variants (PR #75475)

2023-12-15 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/75475
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang-tools-extra] [libc] [compiler-rt] [libcxx] [lldb] [mlir] [flang] [lld] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)

2023-12-15 Thread Mariusz Sikora via cfe-commits


@@ -684,6 +684,51 @@ s_rndne_f16 s5, 0xfe0b
 s_rndne_f16 s5, 0x3456
 // GFX12: encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00]
 
+s_barrier_signal -2

mariusz-sikora-at-amd wrote:

Patch: https://github.com/llvm/llvm-project/pull/75575

https://github.com/llvm/llvm-project/pull/74836
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[flang] [clang] [lldb] [libcxxabi] [lld] [compiler-rt] [clang-tools-extra] [llvm] [libcxx] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-15 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/74576

>From 23759746b66c33028ad2340b1e98067ebf1f8074 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Tue, 28 Jun 2022 15:24:24 -0700
Subject: [PATCH 1/4] [AMDGPU] GFX12: select @llvm.prefetch intrinsic

---
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  21 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   2 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  22 +
 llvm/lib/Target/AMDGPU/SIISelLowering.h   |   2 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|   2 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  12 +
 llvm/lib/Target/AMDGPU/SMInstructions.td  |  34 ++
 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 496 ++
 8 files changed, 591 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79f..f0b3ed7adc294c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
   applyDefaultMapping(OpdMapper);
   constrainOpWithReadfirstlane(B, MI, 8); // M0
   return;
+case Intrinsic::prefetch: {
+  if (!Subtarget.hasPrefetch()) {
+MI.eraseFromParent();
+return;
+  }
+  unsigned PtrBank =
+  getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
+  if (PtrBank == AMDGPU::VGPRRegBankID) {
+MI.eraseFromParent();
+return;
+  }
+  // FIXME: There is currently no support for prefetch in global isel.
+  // There is no node equivalence and what's worse there is no MMO produced
+  // for a prefetch on global isel path.
+  // Prefetch does not affect execution so erase it for now.
+  MI.eraseFromParent();
+  return;
+}
 default: {
   if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
   AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const 
MachineInstr &MI) const {
   getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
   break;
 }
+case Intrinsic::prefetch:
+  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+  break;
 
 default:
   return getInvalidInstructionMapping();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6f..21a9b8147034fc 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
 
+  bool hasPrefetch() const { return GFX12Insts; }
+
   // Scratch is allocated in 256 dword per wave blocks for the entire
   // wavefront. When viewed from the perspective of an arbitrary workitem, this
   // is 4-byte aligned.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..93af38d877c5d4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasMad64_32())
 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
 
+  if (Subtarget->hasPrefetch())
+setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
  {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
   MVT::v2i16, MVT::v2f16, MVT::i128},
@@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
 }
 
+SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
+  if (Op->isDivergent())
+return SDValue();
+
+  switch (cast<MemSDNode>(Op)->getAddressSpace()) {
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+break;
+  default:
+return SDValue();
+  }
+
+  return Op;
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
  const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
@@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG &DAG) const {
 return LowerSTACKSAVE(Op, DAG);
   case ISD::GET_ROUNDING:
 return lowerGET_ROUNDING(Op, DAG);
+  case ISD::PREFETCH:
+return lowerPREFETCH(Op, DAG);
   }
   return SDValue();
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h 
b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c9cc149218a997..5bc091d6e84de3 100644
--- a/llvm/lib/Target/AMDGPU/SIISe

[llvm] [lld] [clang] [compiler-rt] [libcxxabi] [lldb] [clang-tools-extra] [flang] [libcxx] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-15 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/74576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [lldb] [libc] [clang] [flang] [clang-tools-extra] [libcxx] [mlir] [compiler-rt] [lld] GFX12: Add LoopDataPrefetchPass (PR #75625)

2023-12-18 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/75625

>From de5303eb8a9e061dbd365922f85cad02bca5ec26 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Tue, 5 Jul 2022 11:41:29 -0700
Subject: [PATCH 1/3] GFX12: Add LoopDataPrefetchPass

It is currently disabled by default. It will need experiments on a real
HW to tune and decide on the profitability.
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   7 +
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp  |  18 ++
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|   4 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll  |   8 +-
 .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 185 ++
 6 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e8c04ecf39ba02..fdc2077868cf99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -345,6 +345,11 @@ static cl::opt<bool> EnableImageIntrinsicOptimizer(
 cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
 cl::Hidden);
 
+static cl::opt<bool>
+EnableLoopPrefetch("amdgpu-loop-prefetch",
+   cl::desc("Enable loop data prefetch on AMDGPU"),
+   cl::Hidden, cl::init(false));
+
 static cl::opt<bool> EnableMaxIlpSchedStrategy(
 "amdgpu-enable-max-ilp-scheduling-strategy",
 cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
@@ -982,6 +987,8 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
 }
 
 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
+addPass(createLoopDataPrefetchPass());
   addPass(createSeparateConstOffsetFromGEPPass());
   // ReassociateGEPs exposes more opportunities for SLSR. See
   // the example in reassociate-geps-and-slsr.ll.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index f1da1a61bf4dd5..218c5b5cfdac87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1345,3 +1345,21 @@ GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
   Cost.first += (Size + 255) / 256;
   return Cost;
 }
+
+unsigned GCNTTIImpl::getPrefetchDistance() const {
+  return ST->hasPrefetch() ? 128 : 0;
+}
+
+bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
+  switch (AS) {
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+return true;
+  default:
+break;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 1e6c5bbfc0d75b..cd8e9fd10bbf21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -254,6 +254,16 @@ class GCNTTIImpl final : public 
BasicTTIImplBase<GCNTTIImpl> {
   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
  FastMathFlags FMF,
  TTI::TargetCostKind CostKind);
+
+  /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
+  unsigned getCacheLineSize() const override { return 128; }
+
+  /// How much before a load we should place the prefetch instruction.
+  /// This is currently measured in number of IR instructions.
+  unsigned getPrefetchDistance() const override;
+
+  /// \return if target want to issue a prefetch in address space \p AS.
+  bool shouldPrefetchAddressSpace(unsigned AS) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 70ef1fff274a40..717f22fb69fdd3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -245,6 +245,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, 
SDNode *Load1,
   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
 return false;
 
+  // A mayLoad instruction without a def is not a load. Likely a prefetch.
+  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
+return false;
+
   if (isDS(Opc0) && isDS(Opc1)) {
 
 // FIXME: Handle this case:
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll 
b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 488dbe2e3189bf..8b0b6263832243 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < 
%s 2>&1 \
 ; RUN:   | FileCheck -match-full-lines -strict-whitespace -check-prefix=

[llvm] [lldb] [libc] [clang] [flang] [clang-tools-extra] [libcxx] [mlir] [compiler-rt] [lld] GFX12: Add LoopDataPrefetchPass (PR #75625)

2023-12-18 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/75625

>From de5303eb8a9e061dbd365922f85cad02bca5ec26 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Tue, 5 Jul 2022 11:41:29 -0700
Subject: [PATCH 1/4] GFX12: Add LoopDataPrefetchPass

It is currently disabled by default. It will need experiments on a real
HW to tune and decide on the profitability.
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   7 +
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp  |  18 ++
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|   4 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll  |   8 +-
 .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 185 ++
 6 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e8c04ecf39ba02..fdc2077868cf99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -345,6 +345,11 @@ static cl::opt<bool> EnableImageIntrinsicOptimizer(
 cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
 cl::Hidden);
 
+static cl::opt<bool>
+EnableLoopPrefetch("amdgpu-loop-prefetch",
+   cl::desc("Enable loop data prefetch on AMDGPU"),
+   cl::Hidden, cl::init(false));
+
 static cl::opt<bool> EnableMaxIlpSchedStrategy(
 "amdgpu-enable-max-ilp-scheduling-strategy",
 cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
@@ -982,6 +987,8 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
 }
 
 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
+addPass(createLoopDataPrefetchPass());
   addPass(createSeparateConstOffsetFromGEPPass());
   // ReassociateGEPs exposes more opportunities for SLSR. See
   // the example in reassociate-geps-and-slsr.ll.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index f1da1a61bf4dd5..218c5b5cfdac87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1345,3 +1345,21 @@ GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
   Cost.first += (Size + 255) / 256;
   return Cost;
 }
+
+unsigned GCNTTIImpl::getPrefetchDistance() const {
+  return ST->hasPrefetch() ? 128 : 0;
+}
+
+bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
+  switch (AS) {
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+return true;
+  default:
+break;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 1e6c5bbfc0d75b..cd8e9fd10bbf21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -254,6 +254,16 @@ class GCNTTIImpl final : public 
BasicTTIImplBase {
   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
  FastMathFlags FMF,
  TTI::TargetCostKind CostKind);
+
+  /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
+  unsigned getCacheLineSize() const override { return 128; }
+
+  /// How much before a load we should place the prefetch instruction.
+  /// This is currently measured in number of IR instructions.
+  unsigned getPrefetchDistance() const override;
+
+  /// \return if target want to issue a prefetch in address space \p AS.
+  bool shouldPrefetchAddressSpace(unsigned AS) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 70ef1fff274a40..717f22fb69fdd3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -245,6 +245,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, 
SDNode *Load1,
   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
 return false;
 
+  // A mayLoad instruction without a def is not a load. Likely a prefetch.
+  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
+return false;
+
   if (isDS(Opc0) && isDS(Opc1)) {
 
 // FIXME: Handle this case:
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll 
b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 488dbe2e3189bf..8b0b6263832243 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < 
%s 2>&1 \
 ; RUN:   | FileCheck -match-full-lines -strict-whitespace -check-prefix=

[lldb] [clang] [compiler-rt] [libcxx] [flang] [lld] [clang-tools-extra] [mlir] [llvm] [libc] GFX12: Add LoopDataPrefetchPass (PR #75625)

2023-12-18 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/75625
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2023-12-19 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd created 
https://github.com/llvm/llvm-project/pull/75917

- image_atomic_pk_add_f16
- image_atomic_pk_add_bf16
- ds_pk_add_bf16
- ds_pk_add_f16
- ds_pk_add_rtn_bf16
- ds_pk_add_rtn_f16
- flat_atomic_pk_add_f16
- flat_atomic_pk_add_bf16
- global_atomic_pk_add_f16
- global_atomic_pk_add_bf16
- buffer_atomic_pk_add_f16
- buffer_atomic_pk_add_bf16

>From f0920d06a57b3bc77b50baf94c4616be597e74c3 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Mon, 18 Dec 2023 20:08:18 +0100
Subject: [PATCH] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions

- image_atomic_pk_add_f16
- image_atomic_pk_add_bf16
- ds_pk_add_bf16
- ds_pk_add_f16
- ds_pk_add_rtn_bf16
- ds_pk_add_rtn_f16
- flat_atomic_pk_add_f16
- flat_atomic_pk_add_bf16
- global_atomic_pk_add_f16
- global_atomic_pk_add_bf16
- buffer_atomic_pk_add_f16
- buffer_atomic_pk_add_bf16
---
 clang/test/CodeGenOpenCL/amdgpu-features.cl   |   4 +-
 .../builtins-fp-atomics-gfx12.cl  |  92 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  45 ++
 llvm/lib/Target/AMDGPU/AMDGPU.td  |   4 +
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  26 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   2 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   4 +
 llvm/lib/Target/AMDGPU/BUFInstructions.td |  21 +
 llvm/lib/Target/AMDGPU/DSInstructions.td  |  12 +-
 llvm/lib/Target/AMDGPU/FLATInstructions.td|   4 +
 llvm/lib/Target/AMDGPU/MIMGInstructions.td|   2 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  35 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |   1 +
 llvm/lib/TargetParser/TargetParser.cpp|   4 +
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 433 ++
 .../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll |  60 +++
 llvm/test/MC/AMDGPU/gfx11_unsupported.s   |  18 +
 llvm/test/MC/AMDGPU/gfx12_asm_ds.s|  75 +++
 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 132 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vflat.s |  60 +++
 llvm/test/MC/AMDGPU/gfx12_asm_vimage.s|  54 +++
 .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt  |  60 +++
 .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt   |  84 
 .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt  |  60 +++
 .../Disassembler/AMDGPU/gfx12_dasm_vimage.txt |  54 +++
 29 files changed, 1329 insertions(+), 21 deletions(-)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll

diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 8959634572b44e..fe1798406967e8 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -100,8 +100,8 @@
 // GFX1103: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1150: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1151: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1200: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1201: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1200: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx1

[clang-tools-extra] [llvm] [AMDGPU] Quit PromoteAllocaToVector if intrinsic is used (PR #68744)

2023-12-19 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/68744

>From 3dc3a43193247015933392b7da76c4ef77268231 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Tue, 10 Oct 2023 21:50:48 +0200
Subject: [PATCH 1/2] [AMDGPU] Bail if assume-like intrinsic is used in
 PromoteAllocaToVector

The attached test crashes without this change.
---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp   | 2 ++
 .../test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll | 9 +
 2 files changed, 11 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 3707a960211eb4..8ec7d29e00c939 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -772,6 +772,8 @@ bool 
AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
 // Ignore assume-like intrinsics and comparisons used in assumes.
 if (isAssumeLikeIntrinsic(Inst)) {
+  if (!Inst->use_empty())
+return RejectUser(Inst, "assume-like intrinsic cannot have any users");
   UsersToRemove.push_back(Inst);
   continue;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll 
b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
index 0bba1bdce95655..5616bc0f5ef3c1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -53,6 +53,15 @@ define amdgpu_kernel void @promote_with_objectsize(ptr 
addrspace(1) %out) #0 {
   ret void
 }
 
+; CHECK-LABEL: @promote_with_objectsize_8(
+; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [8 x i32]], ptr 
addrspace(3) @promote_with_objectsize_8.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call i32 @llvm.objectsize.i32.p3(ptr addrspace(3) [[PTR]], i1 false, 
i1 false, i1 false)
+define amdgpu_kernel void @promote_with_objectsize_8(ptr addrspace(1) %out) #0 
{
+  %alloca = alloca [8 x i32], align 4, addrspace(5)
+  %size = call i32 @llvm.objectsize.i32.p5(ptr addrspace(5) %alloca, i1 false, 
i1 false, i1 false)
+  store i32 %size, ptr addrspace(1) %out
+  ret void
+}
 ; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy(
 ; CHECK: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 8 
dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) 
%arrayidx2, i64 16, i1 false)
 define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) {

>From a85ab29fe8f338ff740456ea6b8890dd1212ca6c Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Tue, 21 Nov 2023 09:18:45 +0100
Subject: [PATCH 2/2] Fixup - handle objectsize in PromoteAlloca

---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp   | 16 
 .../AMDGPU/promote-alloca-mem-intrinsics.ll  |  3 +--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 8ec7d29e00c939..90d0dc96898758 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -528,6 +528,15 @@ static Value *promoteAllocaUserToVector(
   return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
 }
 
+if (auto *Intr = dyn_cast(Inst)) {
+  if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
+Intr->replaceAllUsesWith(
+Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
+DL.getTypeAllocSize(VectorTy)));
+return nullptr;
+  }
+}
+
 llvm_unreachable("Unsupported call when promoting alloca to vector");
   }
 
@@ -770,6 +779,13 @@ bool 
AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   continue;
 }
 
+if (auto *Intr = dyn_cast(Inst)) {
+  if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
+WorkList.push_back(Inst);
+continue;
+  }
+}
+
 // Ignore assume-like intrinsics and comparisons used in assumes.
 if (isAssumeLikeIntrinsic(Inst)) {
   if (!Inst->use_empty())
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll 
b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
index 5616bc0f5ef3c1..aabd5df9568370 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -54,8 +54,7 @@ define amdgpu_kernel void @promote_with_objectsize(ptr 
addrspace(1) %out) #0 {
 }
 
 ; CHECK-LABEL: @promote_with_objectsize_8(
-; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [8 x i32]], ptr 
addrspace(3) @promote_with_objectsize_8.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call i32 @llvm.objectsize.i32.p3(ptr addrspace(3) [[PTR]], i1 false, 
i1 false, i1 false)
+; CHECK: store i32 32, ptr addrspace(1) %out, align 4
 define amdgpu_kernel void @promote_with_objectsize_8(ptr addrspace(1) %out) #0 
{
   %alloca = alloca [8 x i32], a

[llvm] [clang] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2023-12-19 Thread Mariusz Sikora via cfe-commits


@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu 
gfx1200 \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu 
gfx1200 \

mariusz-sikora-at-amd wrote:

What do you suggest? I copied this test from the other
builtins-fp-atomics-gfxXX tests. I thought it was a good test because it
covers both the LLVM intrinsic and ISA generation.

https://github.com/llvm/llvm-project/pull/75917
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [AMDGPU] Quit PromoteAllocaToVector if intrinsic is used (PR #68744)

2023-12-19 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/68744
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [AMDGPU][GFX12] Default component broadcast store (PR #76212)

2024-01-05 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/76212

>From 06117c6124e94953f62eff3b1b87d98146f9e25e Mon Sep 17 00:00:00 2001
From: Mateja Marjanovic 
Date: Wed, 10 May 2023 16:24:38 +0200
Subject: [PATCH 1/2] [AMDGPU][GFX12] Default component broadcast store

For image and buffer stores the default behaviour on GFX12
is to set all unset components to the value of the first component.
So if we pass only the X component, it will be the same as XXXX, or XY same as XYXX.

This patch simplifies the passed vector of components in InstCombine
by removing components from the end that are equal to the first component.

For image stores it also trims DMask if necessary.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 36 +--
 .../amdgcn-simplify-image-buffer-stores.ll| 32 -
 2 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2bb7b6bd0674a2..da2f862308558b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -402,6 +402,35 @@ static APInt trimTrailingZerosInVector(InstCombiner &IC, 
Value *UseV,
   return DemandedElts;
 }
 
+// Trim elements of the end of the vector \p V, if they are
+// equal to the first element of the vector.
+static APInt defaultComponentBroadcast(Value *V) {
+  auto *VTy = cast(V->getType());
+  unsigned VWidth = VTy->getNumElements();
+  APInt DemandedElts = APInt::getAllOnes(VWidth);
+  Value *FirstComponent = findScalarElement(V, 0);
+
+  SmallVector ShuffleMask;
+  if (auto *SVI = dyn_cast(V))
+SVI->getShuffleMask(ShuffleMask);
+
+  for (int I = VWidth - 1; I > 0; --I) {
+if (ShuffleMask.empty()) {
+  auto *Elt = findScalarElement(V, I);
+  if (!Elt || (Elt != FirstComponent && !isa(Elt)))
+break;
+} else {
+  // Detect identical elements in the shufflevector result, even though
+  // findScalarElement cannot tell us what that element is.
+  if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
+break;
+}
+DemandedElts.clearBit(I);
+  }
+
+  return DemandedElts;
+}
+
 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
 IntrinsicInst &II,
 APInt DemandedElts,
@@ -1140,8 +1169,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, 
IntrinsicInst &II) const {
 if (!isa(II.getArgOperand(0)->getType()))
   break;
 
-APInt DemandedElts =
-trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+APInt DemandedElts;
+if (AMDGPU::isGFX12Plus(*ST))
+  DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
+else
+  DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
 
 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
diff --git 
a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
 
b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
index f2d904cce7f00d..95b1d09bbd6036 100644
--- 
a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
+++ 
b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -23,7 +23,8 @@ define amdgpu_ps void 
@image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg
 ; GCN-NEXT:ret void
 ;
 ; GFX12-LABEL: @image_store_1d_store_insert_zeros_at_end(
-; GFX12-NEXT:call void @llvm.amdgcn.image.store.1d.f32.i32(float 
[[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX12-NEXT:[[NEWVDATA4:%.*]] = insertelement <4 x float> , float 
[[VDATA1:%.*]], i64 0
+; GFX12-NEXT:call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> 
[[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -63,9 +64,9 @@ define amdgpu_ps void 
@buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %
 ; GCN-NEXT:ret void
 ;
 ; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GFX12-NEXT:[[TMP1:%.*]] = insertelement <2 x float> poison, float 
[[VDATA1:%.*]], i64 0
-; GFX12-NEXT:[[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x 
float> poison, <2 x i32> zeroinitializer
-; GFX12-NEXT:call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> 
[[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
+; GFX12-NEXT:[[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT:[[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float 
[[VDATA1]], i64 1
+; GFX12-NEXT:call void @llvm.amdgcn.buffer.st

[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-05 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/76224

>From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001
From: Vang Thao 
Date: Thu, 21 Dec 2023 11:58:47 +0100
Subject: [PATCH 1/2] [AMDGPU][GFX12] Add Atomic cond_sub_u32

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   4 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   3 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   7 +
 llvm/lib/Target/AMDGPU/BUFInstructions.td |  14 +
 llvm/lib/Target/AMDGPU/DSInstructions.td  |  27 +-
 llvm/lib/Target/AMDGPU/FLATInstructions.td|  31 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |   1 +
 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll  | 254 ++
 .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 
 llvm/test/MC/AMDGPU/gfx11_unsupported.s   |  12 +
 llvm/test/MC/AMDGPU/gfx12_asm_ds.s|  18 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s |  66 +
 llvm/test/MC/AMDGPU/gfx12_asm_vflat.s |  36 +++
 .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt  |  81 ++
 .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt   |  42 +++
 .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt  |  18 ++
 23 files changed, 812 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index cb48f54b13a6cd..2d066350ee9f84 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -10,6 +10,10 @@
 //
 
//===--===//
 
+def flat_ptr_ty : LLVMQualPointerType<0>;
+def global_ptr_ty : LLVMQualPointerType<1>;
+def local_ptr_ty : LLVMQualPointerType<3>;
+
 class AMDGPUReadPreloadRegisterIntrinsic
   : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
 
@@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : 
AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : 
AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : 
AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
+def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup :
   Intrinsic<[llvm_i32_ty], [],
 [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, 
IntrNoFree]>;
 
-class AMDGPUAtomicRtn : Intrinsic <
+class AMDGPUAtomicRtn : Intrinsic <
   [vt],
-  [llvm_anyptr_ty,// vaddr
-   vt],   // vdata(VGPR)
+  [pt,  // vaddr
+   vt], // vdata(VGPR)
   [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, 
IntrNoFree], "",
   [SDNPMemOperand]>;
 
@@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num   : 
AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
 

[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-05 Thread Mariusz Sikora via cfe-commits


@@ -2502,10 +2500,9 @@ def int_amdgcn_flat_atomic_fmax_num   : 
AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn;
 
-def int_amdgcn_flat_atomic_cond_sub_u32 : AMDGPUAtomicRtn;
-def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn;
-
-def int_amdgcn_ds_cond_sub_u32 : AMDGPUAtomicRtn;
+def int_amdgcn_flat_atomic_cond_sub_u32   : AMDGPUAtomicRtn;
+def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn;
+def int_amdgcn_ds_cond_sub_u32: AMDGPUAtomicRtn;

mariusz-sikora-at-amd wrote:

@arsenm is this what you were expecting?

https://github.com/llvm/llvm-project/pull/76224
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-05 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

Adding support in atomicrmw: would this require adding a new "cond_sub"
operation to atomicrmw, or did you have something else in mind, @arsenm?

https://github.com/llvm/llvm-project/pull/76224
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Default component broadcast store (PR #76212)

2024-01-10 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

ping @arsenm 

https://github.com/llvm/llvm-project/pull/76212
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [AMDGPU][GFX12] Default component broadcast store (PR #76212)

2024-01-11 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/76212

>From 06117c6124e94953f62eff3b1b87d98146f9e25e Mon Sep 17 00:00:00 2001
From: Mateja Marjanovic 
Date: Wed, 10 May 2023 16:24:38 +0200
Subject: [PATCH 1/2] [AMDGPU][GFX12] Default component broadcast store

For image and buffer stores the default behaviour on GFX12
is to set all unset components to the value of the first component.
So if we pass only the X component, it will be the same as XXXX, or XY same as XYXX.

This patch simplifies the passed vector of components in InstCombine
by removing components from the end that are equal to the first component.

For image stores it also trims DMask if necessary.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 36 +--
 .../amdgcn-simplify-image-buffer-stores.ll| 32 -
 2 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2bb7b6bd0674a2..da2f862308558b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -402,6 +402,35 @@ static APInt trimTrailingZerosInVector(InstCombiner &IC, 
Value *UseV,
   return DemandedElts;
 }
 
+// Trim elements of the end of the vector \p V, if they are
+// equal to the first element of the vector.
+static APInt defaultComponentBroadcast(Value *V) {
+  auto *VTy = cast(V->getType());
+  unsigned VWidth = VTy->getNumElements();
+  APInt DemandedElts = APInt::getAllOnes(VWidth);
+  Value *FirstComponent = findScalarElement(V, 0);
+
+  SmallVector ShuffleMask;
+  if (auto *SVI = dyn_cast(V))
+SVI->getShuffleMask(ShuffleMask);
+
+  for (int I = VWidth - 1; I > 0; --I) {
+if (ShuffleMask.empty()) {
+  auto *Elt = findScalarElement(V, I);
+  if (!Elt || (Elt != FirstComponent && !isa(Elt)))
+break;
+} else {
+  // Detect identical elements in the shufflevector result, even though
+  // findScalarElement cannot tell us what that element is.
+  if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
+break;
+}
+DemandedElts.clearBit(I);
+  }
+
+  return DemandedElts;
+}
+
 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
 IntrinsicInst &II,
 APInt DemandedElts,
@@ -1140,8 +1169,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, 
IntrinsicInst &II) const {
 if (!isa(II.getArgOperand(0)->getType()))
   break;
 
-APInt DemandedElts =
-trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+APInt DemandedElts;
+if (AMDGPU::isGFX12Plus(*ST))
+  DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
+else
+  DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
 
 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
diff --git 
a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
 
b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
index f2d904cce7f00d..95b1d09bbd6036 100644
--- 
a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
+++ 
b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -23,7 +23,8 @@ define amdgpu_ps void 
@image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg
 ; GCN-NEXT:ret void
 ;
 ; GFX12-LABEL: @image_store_1d_store_insert_zeros_at_end(
-; GFX12-NEXT:call void @llvm.amdgcn.image.store.1d.f32.i32(float 
[[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX12-NEXT:[[NEWVDATA4:%.*]] = insertelement <4 x float> , float 
[[VDATA1:%.*]], i64 0
+; GFX12-NEXT:call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> 
[[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -63,9 +64,9 @@ define amdgpu_ps void 
@buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %
 ; GCN-NEXT:ret void
 ;
 ; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GFX12-NEXT:[[TMP1:%.*]] = insertelement <2 x float> poison, float 
[[VDATA1:%.*]], i64 0
-; GFX12-NEXT:[[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x 
float> poison, <2 x i32> zeroinitializer
-; GFX12-NEXT:call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> 
[[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
+; GFX12-NEXT:[[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT:[[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float 
[[VDATA1]], i64 1
+; GFX12-NEXT:call void @llvm.amdgcn.buffer.st

[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Default component broadcast store (PR #76212)

2024-01-11 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

Merged with upstream to run the tests. I will merge these changes if CI passes.

https://github.com/llvm/llvm-project/pull/76212
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-31 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/78729

>From cc492d4134e4aa0aab56d01b21ec85937e49acfd Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Fri, 19 Jan 2024 16:29:46 +0100
Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins

__builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac*
---
 .../builtins-amdgcn-gfx12-err.cl  | 86 ++-
 1 file changed, 85 insertions(+), 1 deletion(-)

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
index bcaea9a2482d1..f91fea1714510 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -4,10 +4,94 @@
 
 typedef unsigned int uint;
 
-kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) {
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef intv2i   __attribute__((ext_vector_type(2)));
+typedef intv4i   __attribute__((ext_vector_type(4)));
+typedef intv16i  __attribute__((ext_vector_type(16)));
+typedef intv32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+void builtin_test_unsupported(double a_double, float a_float,
+  int a_int, long  a_long,
+  v4d a_v4d,
+  v2s a_v2s, v4s a_v4s, v8s a_v8s,
+  v2i a_v2i, v4i a_v4i, v16i a_v16i, v32i a_v32i,
+  v2f a_v2f, v4f a_v4f, v16f a_v16f, v32f  a_v32f,
+  v4h a_v4h, v8h a_v8h,
+
+  uint a, uint b) {
+
   __builtin_amdgcn_ds_gws_init(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_init' needs target feature gws}}
   __builtin_amdgcn_ds_gws_barrier(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_barrier' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_v(a); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_v' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_br(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_br' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_p(a); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_p' needs target feature gws}}
+
+  a_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, a_float, a_v32f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs target 
feature mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, a_float, a_v16f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs target 
feature mai-insts}}
+  a_v4f =  __builtin_amdgcn_mfma_f32_4x4x1f32(a_float, a_float, a_v4f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x1f32' needs target 
feature mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_32x32x2f32(a_float, a_float, a_v16f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x2f32' needs target 
feature mai-insts}}
+  a_v4f =  __builtin_amdgcn_mfma_f32_16x16x4f32(a_float, a_float, a_v4f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f32' needs target 
feature mai-insts}}
+  a_v32f = __builtin_amdgcn_mfma_f32_32x32x4f16(a_v4h, a_v4h, a_v32f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x4f16' needs target 
feature mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_16x16x4f16(a_v4h, a_v4h, a_v16f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f16' needs target 
feature mai-insts}}
+  a_v4f = __builtin_amdgcn_mfma_f32_4x4x4f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); // 
expected-error {{'__builtin_amdgcn_mfma_f32_4x4x4f16' needs target feature 
mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_32x32x8f16(a_v4h, a_v4h, a_v16f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x8f16' needs target 
feature mai-insts}}
+  a_v4f = __builtin_amdgcn_mfma_f32_16x16x16f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); 
// expected-error {{'__builtin_amdgcn_mfma_f32_16x16x16f16' needs target 
feature mai-insts}}
+  a_v32i = __builtin_amdgcn_mfma_i32_32x32x4i8(a_int, a_int, a_v32i, 0, 0, 0); 
// expected-error {{'__builtin_amdgcn_mfma_i32_32x32x4i8' needs targ

[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-31 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/78729
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lldb] [flang] [clang-tools-extra] [libcxx] [clang] [lld] [llvm] [compiler-rt] [libc] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)

2024-01-24 Thread Mariusz Sikora via cfe-commits
Mirko =?utf-8?q?Brkušanin?= ,
Mirko =?utf-8?q?Brkušanin?= ,Mirko Brkusanin
 ,Mariusz Sikora 
Message-ID:
In-Reply-To: 



@@ -8770,6 +8781,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const 
OperandVector &Operands,
   }
 }
 
+int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
+if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) {
+  Inst.addOperand(Inst.getOperand(0));
+}
+
+bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
+  Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
+  Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
+  Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;

mariusz-sikora-at-amd wrote:

Thanks, I will prepare separate PRs to cover this and what Joe pointed out.

https://github.com/llvm/llvm-project/pull/78414
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-24 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

ping

https://github.com/llvm/llvm-project/pull/78729
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-24 Thread Mariusz Sikora via cfe-commits


@@ -4,10 +4,114 @@
 
 typedef unsigned int uint;
 
-kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) {
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef intv2i   __attribute__((ext_vector_type(2)));
+typedef intv4i   __attribute__((ext_vector_type(4)));
+typedef intv16i  __attribute__((ext_vector_type(16)));
+typedef intv32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+void builtin_test_unsupported(global v32f*out_v32f,
+  global v16f*out_v16f,
+  global v4f* out_v4f,
+  global v32i*out_v32i,
+  global v16i*out_v16i,
+  global v4i* out_v4i,
+  global v4d* out_v4d,
+  global double*  out_double,
+  double a_double , double b_double , double 
c_double,

mariusz-sikora-at-amd wrote:

Thanks, I will update these.

https://github.com/llvm/llvm-project/pull/78729
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [compiler-rt] [libcxx] [lldb] [pstl] [flang] [clang-tools-extra] [mlir] [openmp] [lld] [llvm] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)

2024-01-24 Thread Mariusz Sikora via cfe-commits
Mirko =?utf-8?q?Brkušanin?= ,
Mirko =?utf-8?q?Brkušanin?= ,Mirko Brkusanin
 ,Mariusz Sikora 
Message-ID:
In-Reply-To: 


https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/78414
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-25 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/78729

>From 56cf06f1b530d5ec62de1cc3818bf2f76dfd Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Fri, 19 Jan 2024 16:29:46 +0100
Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins

__builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac*
---
 .../builtins-amdgcn-gfx12-err.cl  | 86 ++-
 1 file changed, 85 insertions(+), 1 deletion(-)

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
index bcaea9a2482d186..f91fea17145102a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -4,10 +4,94 @@
 
 typedef unsigned int uint;
 
-kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) {
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef intv2i   __attribute__((ext_vector_type(2)));
+typedef intv4i   __attribute__((ext_vector_type(4)));
+typedef intv16i  __attribute__((ext_vector_type(16)));
+typedef intv32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+void builtin_test_unsupported(double a_double, float a_float,
+  int a_int, long  a_long,
+  v4d a_v4d,
+  v2s a_v2s, v4s a_v4s, v8s a_v8s,
+  v2i a_v2i, v4i a_v4i, v16i a_v16i, v32i a_v32i,
+  v2f a_v2f, v4f a_v4f, v16f a_v16f, v32f  a_v32f,
+  v4h a_v4h, v8h a_v8h,
+
+  uint a, uint b) {
+
   __builtin_amdgcn_ds_gws_init(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_init' needs target feature gws}}
   __builtin_amdgcn_ds_gws_barrier(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_barrier' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_v(a); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_v' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_br(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_br' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_p(a); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_p' needs target feature gws}}
+
+  a_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, a_float, a_v32f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs target 
feature mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, a_float, a_v16f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs target 
feature mai-insts}}
+  a_v4f =  __builtin_amdgcn_mfma_f32_4x4x1f32(a_float, a_float, a_v4f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x1f32' needs target 
feature mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_32x32x2f32(a_float, a_float, a_v16f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x2f32' needs target 
feature mai-insts}}
+  a_v4f =  __builtin_amdgcn_mfma_f32_16x16x4f32(a_float, a_float, a_v4f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f32' needs target 
feature mai-insts}}
+  a_v32f = __builtin_amdgcn_mfma_f32_32x32x4f16(a_v4h, a_v4h, a_v32f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x4f16' needs target 
feature mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_16x16x4f16(a_v4h, a_v4h, a_v16f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f16' needs target 
feature mai-insts}}
+  a_v4f = __builtin_amdgcn_mfma_f32_4x4x4f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); // 
expected-error {{'__builtin_amdgcn_mfma_f32_4x4x4f16' needs target feature 
mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_32x32x8f16(a_v4h, a_v4h, a_v16f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x8f16' needs target 
feature mai-insts}}
+  a_v4f = __builtin_amdgcn_mfma_f32_16x16x16f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); 
// expected-error {{'__builtin_amdgcn_mfma_f32_16x16x16f16' needs target 
feature mai-insts}}
+  a_v32i = __builtin_amdgcn_mfma_i32_32x32x4i8(a_int, a_int, a_v32i, 0, 0, 0); 
// expected-error {{'__builtin_amdgcn_mfma_i32_32x32x4i8' needs 

[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-25 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/78729

>From 19e0554bcebf739f7ad500f64efe62b38781f7a1 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Fri, 19 Jan 2024 16:29:46 +0100
Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins

__builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac*
---
 .../builtins-amdgcn-gfx12-err.cl  | 86 ++-
 1 file changed, 85 insertions(+), 1 deletion(-)

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
index bcaea9a2482d186..f91fea17145102a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -4,10 +4,94 @@
 
 typedef unsigned int uint;
 
-kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) {
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef intv2i   __attribute__((ext_vector_type(2)));
+typedef intv4i   __attribute__((ext_vector_type(4)));
+typedef intv16i  __attribute__((ext_vector_type(16)));
+typedef intv32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+void builtin_test_unsupported(double a_double, float a_float,
+  int a_int, long  a_long,
+  v4d a_v4d,
+  v2s a_v2s, v4s a_v4s, v8s a_v8s,
+  v2i a_v2i, v4i a_v4i, v16i a_v16i, v32i a_v32i,
+  v2f a_v2f, v4f a_v4f, v16f a_v16f, v32f  a_v32f,
+  v4h a_v4h, v8h a_v8h,
+
+  uint a, uint b) {
+
   __builtin_amdgcn_ds_gws_init(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_init' needs target feature gws}}
   __builtin_amdgcn_ds_gws_barrier(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_barrier' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_v(a); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_v' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_br(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_br' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_p(a); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_p' needs target feature gws}}
+
+  a_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, a_float, a_v32f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs target 
feature mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, a_float, a_v16f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs target 
feature mai-insts}}
+  a_v4f =  __builtin_amdgcn_mfma_f32_4x4x1f32(a_float, a_float, a_v4f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x1f32' needs target 
feature mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_32x32x2f32(a_float, a_float, a_v16f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x2f32' needs target 
feature mai-insts}}
+  a_v4f =  __builtin_amdgcn_mfma_f32_16x16x4f32(a_float, a_float, a_v4f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f32' needs target 
feature mai-insts}}
+  a_v32f = __builtin_amdgcn_mfma_f32_32x32x4f16(a_v4h, a_v4h, a_v32f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x4f16' needs target 
feature mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_16x16x4f16(a_v4h, a_v4h, a_v16f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f16' needs target 
feature mai-insts}}
+  a_v4f = __builtin_amdgcn_mfma_f32_4x4x4f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); // 
expected-error {{'__builtin_amdgcn_mfma_f32_4x4x4f16' needs target feature 
mai-insts}}
+  a_v16f = __builtin_amdgcn_mfma_f32_32x32x8f16(a_v4h, a_v4h, a_v16f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x8f16' needs target 
feature mai-insts}}
+  a_v4f = __builtin_amdgcn_mfma_f32_16x16x16f16(a_v4h, a_v4h, a_v4f, 0, 0, 0); 
// expected-error {{'__builtin_amdgcn_mfma_f32_16x16x16f16' needs target 
feature mai-insts}}
+  a_v32i = __builtin_amdgcn_mfma_i32_32x32x4i8(a_int, a_int, a_v32i, 0, 0, 0); 
// expected-error {{'__builtin_amdgcn_mfma_i32_32x32x4i8' needs 

[llvm] [clang-tools-extra] [clang] [AMDGPU][GFX12] Default component broadcast store (PR #76212)

2024-01-11 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/76212
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)

2024-01-12 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd created 
https://github.com/llvm/llvm-project/pull/77892

Encoding is VOP3P. Tagged as deep/machine learning instructions. i32 type 
(v4fp8 or v4bf8 packed in i32) is used for src0 and src1. src0 and src1 have no 
src_modifiers. src2 is f32 and has src_modifiers: f32 fneg(neg_lo[2]) and f32 
fabs(neg_hi[2]).

>From 628a3d2b42cdcbd903e0830ab7d631ea7dc422b9 Mon Sep 17 00:00:00 2001
From: Petar Avramovic 
Date: Wed, 10 Jan 2024 12:17:58 +0100
Subject: [PATCH] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions

Encoding is VOP3P. Tagged as deep/machine learning instructions.
i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1.
src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers:
f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   4 +
 .../builtins-amdgcn-dl-insts-err.cl   |   5 +
 .../builtins-amdgcn-dl-insts-gfx12.cl |  20 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 ++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   4 +
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  |  46 
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp |  17 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  47 
 llvm/lib/Target/AMDGPU/VOPInstructions.td |  13 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll   | 255 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s | 120 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s   |  24 ++
 .../MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s |  24 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s|  24 ++
 .../test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s |  27 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s | 133 +
 .../Disassembler/AMDGPU/gfx12_dasm_vop3p.txt  | 120 +
 .../AMDGPU/gfx12_dasm_vop3p_dpp16.txt |  24 ++
 .../AMDGPU/gfx12_dasm_vop3p_dpp8.txt  |  24 ++
 19 files changed, 938 insertions(+), 12 deletions(-)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
 create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s
 create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s
 create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..1c1b9b2c9e9e8c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -255,6 +255,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", 
"nc", "dot8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
 TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_bf8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_fp8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_fp8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_bf8, "fUiUif", "nc", 
"gfx12-insts")
 
 
//===--===//
 // GFX10+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index 6573325150d958..1be47f71276208 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -49,4 +49,9 @@ kernel void builtins_amdgcn_dl_insts_err(
 
   iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);// 
expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
   iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // 
expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
+
+  fOut[5] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_bf8' needs target feature 
gfx12-insts}}
+  fOut[6] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_fp8' needs target feature 
gfx12-insts}}
+  fOut[7] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_fp8' needs target feature 
gfx12-insts}}
+  fOut[8] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_bf8' needs target feature 
gfx12-insts}}
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
new file mode 100644
index 00..31e10c0a5dc18c
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1

[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2024-01-15 Thread Mariusz Sikora via cfe-commits


@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu 
gfx1200 \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu 
gfx1200 \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX12 %s
+

mariusz-sikora-at-amd wrote:

Added here: 
https://github.com/llvm/llvm-project/blob/main/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl

https://github.com/llvm/llvm-project/pull/75917
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2024-01-15 Thread Mariusz Sikora via cfe-commits


@@ -362,24 +358,34 @@ define amdgpu_ps void 
@struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4
   ret void
 }
 
-define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x i16> %val, <4 
x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, 
<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
 ; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_ret:
 ; GFX12:   ; %bb.0:
 ; GFX12-NEXT:buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen 
th:TH_ATOMIC_RETURN
+; GFX12-NEXT:v_mov_b32_e32 v1, 0
+; GFX12-NEXT:v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:s_waitcnt vmcnt(0)
+; GFX12-NEXT:flat_store_b32 v[1:2], v0
+; GFX12-NEXT:v_mov_b32_e32 v0, 1.0
+; GFX12-NEXT:s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret:
 ; GFX12-GISEL:   ; %bb.0:
 ; GFX12-GISEL-NEXT:buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen 
offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT:v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:flat_store_b32 v[1:2], v0
+; GFX12-GISEL-NEXT:v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT:s_waitcnt lgkmcnt(0)
 ; GFX12-GISEL-NEXT:; return to shader part epilog
-  %orig = call <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x 
i16> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
-  %r = bitcast <2 x i16> %orig to float

mariusz-sikora-at-amd wrote:

Found an issue with GlobalISel and bitcast with the bfloat type. I will prepare a 
fix and push it in a separate change.

https://github.com/llvm/llvm-project/pull/75917
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2024-01-15 Thread Mariusz Sikora via cfe-commits


@@ -27,34 +27,23 @@ main_body:
   ret float %out0
 }
 
-define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x 
i16> %data, i32 %s) {
+define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x 
bfloat> %data, i32 %s) {
 ; GFX12-LABEL: atomic_pk_add_bf16_1d_v2:
 ; GFX12:   ; %bb.0: ; %main_body
 ; GFX12-NEXT:image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 
dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:v_mov_b32_e32 v1, 0
+; GFX12-NEXT:v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:s_waitcnt vmcnt(0)
+; GFX12-NEXT:flat_store_b32 v[1:2], v0
+; GFX12-NEXT:v_mov_b32_e32 v0, 1.0
+; GFX12-NEXT:s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:; return to shader part epilog
 main_body:
-  %out = call <2 x i16> 
@llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16> %data, i32 %s, 
<8 x i32> %rsrc, i32 0, i32 0)
-  %out_i32 = bitcast <2 x i16> %out to i32
-  %out_float = bitcast i32 %out_i32 to float
-  ret float %out_float
-}
-
-define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x 
i16> %data, i32 %s) {
-; GFX12-LABEL: atomic_pk_add_bf16_1d_v4:
-; GFX12:   ; %bb.0: ; %main_body
-; GFX12-NEXT:image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 
dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:s_waitcnt vmcnt(0)
-; GFX12-NEXT:; return to shader part epilog
-main_body:
-  %out = call <4 x i16> 
@llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16> %data, i32 %s, 
<8 x i32> %rsrc, i32 0, i32 0)

mariusz-sikora-at-amd wrote:

Found an issue with <4 x bfloat> and GlobalISel. I will try to debug this and 
prepare a fix.

https://github.com/llvm/llvm-project/pull/75917
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang-tools-extra] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2024-01-15 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

What is the plan for the atomic_{flat/ds/global}_bf16 builtins? Right now they 
accept <2 x i16> instead of <2 x bfloat>. Do we want to create new builtins, 
or do we want to overload them to accept both <2 x i16> and <2 x bfloat>?

https://github.com/llvm/llvm-project/pull/75917
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-15 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

ping

https://github.com/llvm/llvm-project/pull/76224
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-15 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/76224

>From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001
From: Vang Thao 
Date: Thu, 21 Dec 2023 11:58:47 +0100
Subject: [PATCH 1/3] [AMDGPU][GFX12] Add Atomic cond_sub_u32

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   4 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   3 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   7 +
 llvm/lib/Target/AMDGPU/BUFInstructions.td |  14 +
 llvm/lib/Target/AMDGPU/DSInstructions.td  |  27 +-
 llvm/lib/Target/AMDGPU/FLATInstructions.td|  31 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |   1 +
 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll  | 254 ++
 .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 
 llvm/test/MC/AMDGPU/gfx11_unsupported.s   |  12 +
 llvm/test/MC/AMDGPU/gfx12_asm_ds.s|  18 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s |  66 +
 llvm/test/MC/AMDGPU/gfx12_asm_vflat.s |  36 +++
 .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt  |  81 ++
 .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt   |  42 +++
 .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt  |  18 ++
 23 files changed, 812 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index cb48f54b13a6cd..2d066350ee9f84 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -10,6 +10,10 @@
 //
 
//===--===//
 
+def flat_ptr_ty : LLVMQualPointerType<0>;
+def global_ptr_ty : LLVMQualPointerType<1>;
+def local_ptr_ty : LLVMQualPointerType<3>;
+
 class AMDGPUReadPreloadRegisterIntrinsic
   : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
 
@@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : 
AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : 
AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : 
AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
+def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup :
   Intrinsic<[llvm_i32_ty], [],
 [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, 
IntrNoFree]>;
 
-class AMDGPUAtomicRtn : Intrinsic <
+class AMDGPUAtomicRtn : Intrinsic <
   [vt],
-  [llvm_anyptr_ty,// vaddr
-   vt],   // vdata(VGPR)
+  [pt,  // vaddr
+   vt], // vdata(VGPR)
   [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, 
IntrNoFree], "",
   [SDNPMemOperand]>;
 
@@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num   : 
AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
 

[llvm] [clang] [clang-tools-extra] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-15 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

> Missing UniformityAnalysis test for these

Done

https://github.com/llvm/llvm-project/pull/76224
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)

2024-01-16 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/77892

>From 628a3d2b42cdcbd903e0830ab7d631ea7dc422b9 Mon Sep 17 00:00:00 2001
From: Petar Avramovic 
Date: Wed, 10 Jan 2024 12:17:58 +0100
Subject: [PATCH 1/2] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions

Encoding is VOP3P. Tagged as deep/machine learning instructions.
i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1.
src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers:
f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   4 +
 .../builtins-amdgcn-dl-insts-err.cl   |   5 +
 .../builtins-amdgcn-dl-insts-gfx12.cl |  20 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 ++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   4 +
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  |  46 
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp |  17 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  47 
 llvm/lib/Target/AMDGPU/VOPInstructions.td |  13 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll   | 255 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s | 120 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s   |  24 ++
 .../MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s |  24 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s|  24 ++
 .../test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s |  27 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s | 133 +
 .../Disassembler/AMDGPU/gfx12_dasm_vop3p.txt  | 120 +
 .../AMDGPU/gfx12_dasm_vop3p_dpp16.txt |  24 ++
 .../AMDGPU/gfx12_dasm_vop3p_dpp8.txt  |  24 ++
 19 files changed, 938 insertions(+), 12 deletions(-)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
 create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s
 create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s
 create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..1c1b9b2c9e9e8c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -255,6 +255,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", 
"nc", "dot8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
 TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_bf8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_fp8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_fp8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_bf8, "fUiUif", "nc", 
"gfx12-insts")
 
 
//===--===//
 // GFX10+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index 6573325150d958..1be47f71276208 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -49,4 +49,9 @@ kernel void builtins_amdgcn_dl_insts_err(
 
   iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);// 
expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
   iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // 
expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
+
+  fOut[5] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_bf8' needs target feature 
gfx12-insts}}
+  fOut[6] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_fp8' needs target feature 
gfx12-insts}}
+  fOut[7] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_fp8' needs target feature 
gfx12-insts}}
+  fOut[8] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_bf8' needs target feature 
gfx12-insts}}
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
new file mode 100644
index 00..31e10c0a5dc18c
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S 
-emit-llvm -o - %s | FileCheck %s
+
+typedef unsigned int uint;
+
+// CHECK-LABEL: @builtins_amdgcn_dl_insts
+// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %uiA, i32 %uiB, float 
%fC)

[clang] [llvm] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)

2024-01-16 Thread Mariusz Sikora via cfe-commits


@@ -2696,6 +2696,25 @@ def int_amdgcn_udot8 :
  ImmArg>,  ImmArg>, ImmArg>]
   >;
 
+// f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b 
(as i32) %b, f32 %c)
+//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
+class AMDGPU8bitFloatDot4Intrinsic :
+  ClangBuiltin,
+  DefaultAttrsIntrinsic<
+[llvm_float_ty], // %r
+[
+  llvm_i32_ty,   // %a
+  llvm_i32_ty,   // %b
+  llvm_float_ty, // %c
+],
+[IntrNoMem, IntrSpeculatable]
+  >;
+
+def int_amdgcn_fdot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_fdot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic;

mariusz-sikora-at-amd wrote:

Done, renamed fdot4 to dot4

https://github.com/llvm/llvm-project/pull/77892
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2024-01-16 Thread Mariusz Sikora via cfe-commits


@@ -27,34 +27,23 @@ main_body:
   ret float %out0
 }
 
-define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x 
i16> %data, i32 %s) {
+define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x 
bfloat> %data, i32 %s) {
 ; GFX12-LABEL: atomic_pk_add_bf16_1d_v2:
 ; GFX12:   ; %bb.0: ; %main_body
 ; GFX12-NEXT:image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 
dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:v_mov_b32_e32 v1, 0
+; GFX12-NEXT:v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:s_waitcnt vmcnt(0)
+; GFX12-NEXT:flat_store_b32 v[1:2], v0
+; GFX12-NEXT:v_mov_b32_e32 v0, 1.0
+; GFX12-NEXT:s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:; return to shader part epilog
 main_body:
-  %out = call <2 x i16> 
@llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16> %data, i32 %s, 
<8 x i32> %rsrc, i32 0, i32 0)
-  %out_i32 = bitcast <2 x i16> %out to i32
-  %out_float = bitcast i32 %out_i32 to float
-  ret float %out_float
-}
-
-define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x 
i16> %data, i32 %s) {
-; GFX12-LABEL: atomic_pk_add_bf16_1d_v4:
-; GFX12:   ; %bb.0: ; %main_body
-; GFX12-NEXT:image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 
dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:s_waitcnt vmcnt(0)
-; GFX12-NEXT:; return to shader part epilog
-main_body:
-  %out = call <4 x i16> 
@llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16> %data, i32 %s, 
<8 x i32> %rsrc, i32 0, i32 0)

mariusz-sikora-at-amd wrote:

Yes, #77448 will fix these issues.

https://github.com/llvm/llvm-project/pull/75917
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2024-01-16 Thread Mariusz Sikora via cfe-commits


@@ -1368,6 +1391,28 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : 
Intrinsic<
 // gfx908 intrinsic
 def int_amdgcn_struct_buffer_atomic_fadd : 
AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_fadd : 
AMDGPUStructPtrBufferAtomic;
+// gfx12 intrinsic
+def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic <
+  [llvm_v2i16_ty],

mariusz-sikora-at-amd wrote:

Done

https://github.com/llvm/llvm-project/pull/75917
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-16 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/76224

>From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001
From: Vang Thao 
Date: Thu, 21 Dec 2023 11:58:47 +0100
Subject: [PATCH 1/4] [AMDGPU][GFX12] Add Atomic cond_sub_u32

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   4 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   3 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   7 +
 llvm/lib/Target/AMDGPU/BUFInstructions.td |  14 +
 llvm/lib/Target/AMDGPU/DSInstructions.td  |  27 +-
 llvm/lib/Target/AMDGPU/FLATInstructions.td|  31 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |   1 +
 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll  | 254 ++
 .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 
 llvm/test/MC/AMDGPU/gfx11_unsupported.s   |  12 +
 llvm/test/MC/AMDGPU/gfx12_asm_ds.s|  18 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s |  66 +
 llvm/test/MC/AMDGPU/gfx12_asm_vflat.s |  36 +++
 .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt  |  81 ++
 .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt   |  42 +++
 .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt  |  18 ++
 23 files changed, 812 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index cb48f54b13a6cd..2d066350ee9f84 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -10,6 +10,10 @@
 //
 
//===--===//
 
+def flat_ptr_ty : LLVMQualPointerType<0>;
+def global_ptr_ty : LLVMQualPointerType<1>;
+def local_ptr_ty : LLVMQualPointerType<3>;
+
 class AMDGPUReadPreloadRegisterIntrinsic
   : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
 
@@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : 
AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : 
AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : 
AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
+def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup :
   Intrinsic<[llvm_i32_ty], [],
 [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, 
IntrNoFree]>;
 
-class AMDGPUAtomicRtn : Intrinsic <
+class AMDGPUAtomicRtn : Intrinsic <
   [vt],
-  [llvm_anyptr_ty,// vaddr
-   vt],   // vdata(VGPR)
+  [pt,  // vaddr
+   vt], // vdata(VGPR)
   [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, 
IntrNoFree], "",
   [SDNPMemOperand]>;
 
@@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num   : 
AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
 

[clang] [clang-tools-extra] [llvm] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-16 Thread Mariusz Sikora via cfe-commits


@@ -2502,10 +2500,9 @@ def int_amdgcn_flat_atomic_fmax_num   : 
AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn;
 
-def int_amdgcn_flat_atomic_cond_sub_u32 : AMDGPUAtomicRtn;
-def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn;
-
-def int_amdgcn_ds_cond_sub_u32 : AMDGPUAtomicRtn;
+def int_amdgcn_flat_atomic_cond_sub_u32   : AMDGPUAtomicRtn;
+def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn;
+def int_amdgcn_ds_cond_sub_u32: AMDGPUAtomicRtn;

mariusz-sikora-at-amd wrote:

done

https://github.com/llvm/llvm-project/pull/76224
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-16 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/76224

>From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001
From: Vang Thao 
Date: Thu, 21 Dec 2023 11:58:47 +0100
Subject: [PATCH 1/5] [AMDGPU][GFX12] Add Atomic cond_sub_u32

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   4 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   3 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   7 +
 llvm/lib/Target/AMDGPU/BUFInstructions.td |  14 +
 llvm/lib/Target/AMDGPU/DSInstructions.td  |  27 +-
 llvm/lib/Target/AMDGPU/FLATInstructions.td|  31 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |   1 +
 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll  | 254 ++
 .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 
 llvm/test/MC/AMDGPU/gfx11_unsupported.s   |  12 +
 llvm/test/MC/AMDGPU/gfx12_asm_ds.s|  18 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s |  66 +
 llvm/test/MC/AMDGPU/gfx12_asm_vflat.s |  36 +++
 .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt  |  81 ++
 .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt   |  42 +++
 .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt  |  18 ++
 23 files changed, 812 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index cb48f54b13a6cda..2d066350ee9f84e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -10,6 +10,10 @@
 //
 
//===--===//
 
+def flat_ptr_ty : LLVMQualPointerType<0>;
+def global_ptr_ty : LLVMQualPointerType<1>;
+def local_ptr_ty : LLVMQualPointerType<3>;
+
 class AMDGPUReadPreloadRegisterIntrinsic
   : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
 
@@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : 
AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : 
AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : 
AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
+def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup :
   Intrinsic<[llvm_i32_ty], [],
 [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, 
IntrNoFree]>;
 
-class AMDGPUAtomicRtn : Intrinsic <
+class AMDGPUAtomicRtn : Intrinsic <
   [vt],
-  [llvm_anyptr_ty,// vaddr
-   vt],   // vdata(VGPR)
+  [pt,  // vaddr
+   vt], // vdata(VGPR)
   [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, 
IntrNoFree], "",
   [SDNPMemOperand]>;
 
@@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num   : 
AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;

[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-16 Thread Mariusz Sikora via cfe-commits


@@ -1182,6 +1182,11 @@ The AMDGPU backend implements the following LLVM IR 
intrinsics.
 
The iglp_opt strategy 
implementations are subject to change.
 
+  llvm.atomic.cond.sub.u32 Provides direct access to 
flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32

mariusz-sikora-at-amd wrote:

ah, thanks !

https://github.com/llvm/llvm-project/pull/76224
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-16 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/76224

>From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001
From: Vang Thao 
Date: Thu, 21 Dec 2023 11:58:47 +0100
Subject: [PATCH 1/6] [AMDGPU][GFX12] Add Atomic cond_sub_u32

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   4 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   3 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   7 +
 llvm/lib/Target/AMDGPU/BUFInstructions.td |  14 +
 llvm/lib/Target/AMDGPU/DSInstructions.td  |  27 +-
 llvm/lib/Target/AMDGPU/FLATInstructions.td|  31 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |   1 +
 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll  | 254 ++
 .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 
 llvm/test/MC/AMDGPU/gfx11_unsupported.s   |  12 +
 llvm/test/MC/AMDGPU/gfx12_asm_ds.s|  18 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s |  66 +
 llvm/test/MC/AMDGPU/gfx12_asm_vflat.s |  36 +++
 .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt  |  81 ++
 .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt   |  42 +++
 .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt  |  18 ++
 23 files changed, 812 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index cb48f54b13a6cd..2d066350ee9f84 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -10,6 +10,10 @@
 //
 
//===--===//
 
+def flat_ptr_ty : LLVMQualPointerType<0>;
+def global_ptr_ty : LLVMQualPointerType<1>;
+def local_ptr_ty : LLVMQualPointerType<3>;
+
 class AMDGPUReadPreloadRegisterIntrinsic
   : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
 
@@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : 
AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : 
AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : 
AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
+def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : 
AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup :
   Intrinsic<[llvm_i32_ty], [],
 [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, 
IntrNoFree]>;
 
-class AMDGPUAtomicRtn : Intrinsic <
+class AMDGPUAtomicRtn : Intrinsic <
   [vt],
-  [llvm_anyptr_ty,// vaddr
-   vt],   // vdata(VGPR)
+  [pt,  // vaddr
+   vt], // vdata(VGPR)
   [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, 
IntrNoFree], "",
   [SDNPMemOperand]>;
 
@@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num   : 
AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
 

[llvm] [clang] [AMDGPU][NFC] Rename feature FP8Insts to FP8ConversionInsts (PR #78439)

2024-01-17 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd created 
https://github.com/llvm/llvm-project/pull/78439

None

>From 5bd1644ec60996fed50c843e13e68f7c2c6dda81 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Wed, 17 Jan 2024 13:19:55 +0100
Subject: [PATCH] [AMDGPU][NFC] Rename feature FP8Insts to FP8ConversionInsts

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def | 16 
 clang/test/CodeGenOpenCL/amdgpu-features.cl  |  6 +++---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 10 ++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h|  5 +
 llvm/lib/Target/AMDGPU/VOP1Instructions.td   |  4 ++--
 llvm/lib/Target/AMDGPU/VOP3Instructions.td   |  4 ++--
 llvm/lib/TargetParser/TargetParser.cpp   |  1 +
 7 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194e..f02b4d321328fe2 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -397,14 +397,14 @@ 
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8, "V16fV2iV4iV16fiIiI
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8, 
"V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8, 
"V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
 
-TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
 
 
//===--===//
 // GFX12+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 8959634572b44e9..df58cd7b62006da 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -80,9 +80,9 @@
 // GFX909: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX90A: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX90C: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX940: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX941: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX942: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX940: 
"target-fea

[clang] [llvm] [AMDGPU][NFC] Rename feature FP8Insts to FP8ConversionInsts (PR #78439)

2024-01-17 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/78439

>From 5bd1644ec60996fed50c843e13e68f7c2c6dda81 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Wed, 17 Jan 2024 13:19:55 +0100
Subject: [PATCH 1/2] [AMDGPU][NFC] Rename feature FP8Insts to
 FP8ConversionInsts

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def | 16 
 clang/test/CodeGenOpenCL/amdgpu-features.cl  |  6 +++---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 10 ++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h|  5 +
 llvm/lib/Target/AMDGPU/VOP1Instructions.td   |  4 ++--
 llvm/lib/Target/AMDGPU/VOP3Instructions.td   |  4 ++--
 llvm/lib/TargetParser/TargetParser.cpp   |  1 +
 7 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194e..f02b4d321328fe2 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -397,14 +397,14 @@ 
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8, "V16fV2iV4iV16fiIiI
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8, 
"V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8, 
"V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
 
-TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
 
 
//===--===//
 // GFX12+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 8959634572b44e9..df58cd7b62006da 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -80,9 +80,9 @@
 // GFX909: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX90A: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX90C: 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX940: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX941: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX942: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX940: 
"target-feat

[llvm] [clang-tools-extra] [clang] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-17 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/76224
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU][NFC] Rename feature FP8Insts to FP8ConversionInsts (PR #78439)

2024-01-17 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/78439
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lldb] [clang] [compiler-rt] [flang] [lld] [llvm] [libcxx] [libc] [clang-tools-extra] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)

2024-01-18 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/77892

>From 628a3d2b42cdcbd903e0830ab7d631ea7dc422b9 Mon Sep 17 00:00:00 2001
From: Petar Avramovic 
Date: Wed, 10 Jan 2024 12:17:58 +0100
Subject: [PATCH 1/2] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions

Encoding is VOP3P. Tagged as deep/machine learning instructions.
i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1.
src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers:
f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   4 +
 .../builtins-amdgcn-dl-insts-err.cl   |   5 +
 .../builtins-amdgcn-dl-insts-gfx12.cl |  20 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 ++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   4 +
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  |  46 
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp |  17 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  47 
 llvm/lib/Target/AMDGPU/VOPInstructions.td |  13 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll   | 255 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s | 120 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s   |  24 ++
 .../MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s |  24 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s|  24 ++
 .../test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s |  27 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s | 133 +
 .../Disassembler/AMDGPU/gfx12_dasm_vop3p.txt  | 120 +
 .../AMDGPU/gfx12_dasm_vop3p_dpp16.txt |  24 ++
 .../AMDGPU/gfx12_dasm_vop3p_dpp8.txt  |  24 ++
 19 files changed, 938 insertions(+), 12 deletions(-)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
 create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s
 create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s
 create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..1c1b9b2c9e9e8c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -255,6 +255,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", 
"nc", "dot8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
 TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_bf8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_fp8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_fp8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_bf8, "fUiUif", "nc", 
"gfx12-insts")
 
 
//===--===//
 // GFX10+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index 6573325150d958..1be47f71276208 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -49,4 +49,9 @@ kernel void builtins_amdgcn_dl_insts_err(
 
   iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);// 
expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
   iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // 
expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
+
+  fOut[5] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_bf8' needs target feature 
gfx12-insts}}
+  fOut[6] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_fp8' needs target feature 
gfx12-insts}}
+  fOut[7] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_fp8' needs target feature 
gfx12-insts}}
+  fOut[8] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);// 
expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_bf8' needs target feature 
gfx12-insts}}
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
new file mode 100644
index 00..31e10c0a5dc18c
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S 
-emit-llvm -o - %s | FileCheck %s
+
+typedef unsigned int uint;
+
+// CHECK-LABEL: @builtins_amdgcn_dl_insts
+// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %uiA, i32 %uiB, float 
%fC)

[lldb] [clang] [compiler-rt] [flang] [lld] [llvm] [libcxx] [libc] [clang-tools-extra] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)

2024-01-18 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

Rebase to run tests

https://github.com/llvm/llvm-project/pull/77892
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[libcxx] [llvm] [clang-tools-extra] [libc] [clang] [flang] [compiler-rt] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2024-01-18 Thread Mariusz Sikora via cfe-commits


@@ -1,56 +1,244 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck 
-check-prefix=GFX12 %s
-; RUN: llc -march=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < 
%s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck 
-check-prefix=GFX12-SDAG %s

mariusz-sikora-at-amd wrote:

Done

https://github.com/llvm/llvm-project/pull/75917
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [lld] [compiler-rt] [libcxx] [clang-tools-extra] [flang] [lldb] [llvm] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)

2024-01-18 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/77892
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [libcxx] [clang-tools-extra] [flang] [compiler-rt] [llvm] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)

2024-01-18 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/75917
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-19 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd created 
https://github.com/llvm/llvm-project/pull/78729

__builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac*

>From d5a823584487d9f6b3e9bebc8976c7891243f470 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Fri, 19 Jan 2024 16:29:46 +0100
Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins

__builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac*
---
 ...ltins-amdgcn-error-unsupported-on-gfx12.cl | 105 ++
 1 file changed, 105 insertions(+)
 create mode 100644 
clang/test/SemaOpenCL/builtins-amdgcn-error-unsupported-on-gfx12.cl

diff --git 
a/clang/test/SemaOpenCL/builtins-amdgcn-error-unsupported-on-gfx12.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-unsupported-on-gfx12.cl
new file mode 100644
index 00..3e290f76017ffa
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-unsupported-on-gfx12.cl
@@ -0,0 +1,105 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1200 -verify -S -o - %s
+
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef intv2i   __attribute__((ext_vector_type(2)));
+typedef intv4i   __attribute__((ext_vector_type(4)));
+typedef intv16i  __attribute__((ext_vector_type(16)));
+typedef intv32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+void test(global v32f*out_v32f,
+  global v16f*out_v16f,
+ global v4f* out_v4f,
+ global v32i*out_v32i,
+ global v16i*out_v16i,
+ global v4i* out_v4i,
+ global v4d* out_v4d,
+ global double*  out_double,
+ double a_double , double b_double , double c_double,
+  float a_float   , float  b_float  , float  c_float,
+ int   a_int , intb_int, intc_int,
+ long  a_long, long   b_long   , long   c_long,
+ v4d   a_v4d , v4db_v4d, v4dc_v4d,
+ v8s   a_v8s , v8sb_v8s, v8sc_v8s,
+ v4s   a_v4s , v4sb_v4s, v4sc_v4s,
+ v2s   a_v2s , v2sb_v2s, v2sc_v2s,
+ v2i   a_v2i , v2ib_v2i, v2ic_v2i,
+ v16i  a_v16i, v16i   b_v16i   , v16i   c_v16i,
+ v32i  a_v32i, v32i   b_v32i   , v32i   c_v32i,
+ v4i   a_v4i , v4ib_v4i, v4ic_v4i,
+ v2f   a_v2f , v2fb_v2f, v2fc_v2f,
+ v4f   a_v4f , v4fb_v4f, v4fc_v4f,
+ v16f  a_v16f, v16f   b_v16f   , v16f   c_v16f,
+ v32f  a_v32f, v32f   b_v32f   , v32f   c_v32f,
+ v4h   a_v4h , v4hb_v4h, v4hc_v4h,
+ v8h   a_v8h , v8hb_v8h, v8hc_v8h,
+ int   idx) {
+  *out_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, b_float, c_v32f, 
0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs 
target feature mai-insts}}
+  *out_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, b_float, c_v16f, 
0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs 
target feature mai-insts}}
+  *out_v4f =  __builtin_amdgcn_mfma_f32_4x4x1f32(a_float, b_float, c_v4f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_4x4x1f32' needs target 
feature mai-insts}}
+  *out_v16f = __builtin_amdgcn_mfma_f32_32x32x2f32(a_float, b_float, c_v16f, 
0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x2f32' needs 
target feature mai-insts}}
+  *out_v4f =  __builtin_amdgcn_mfma_f32_16x16x4f32(a_float, b_float, c_v4f, 0, 
0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f32' needs target 
feature mai-insts}}
+  *out_v32f = __builtin_amdgcn_mfma_f32_32x32x4f16(a_v4h, b_v4h, c_v32f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x4f16' needs target 
feature mai-insts}}
+  *out_v16f = __builtin_amdgcn_mfma_f32_16x16x4f16(a_v4h, b_v4h, c_v16f, 0, 0, 
0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x4f16' needs target 
feature mai-insts}}
+  *out_v4f = __builtin_amdgcn_mfma_f32_4x4x4f16(a_v4h, b_v4h, c_v4f, 0, 0, 0); 
// expected-error {{'__builtin_amdgcn_mfma_f32_4x4x4f16' needs target feature 
mai-insts}}
+  *

[clang-tools-extra] [compiler-rt] [lldb] [clang] [libcxx] [llvm] [flang] [lld] [libc] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)

2024-01-19 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

> Can you add a GFX12 RUN line to 
> clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl? That will probably require 
> adding "fp8-conversion-insts" to the GFX12 part of TargetParser.cpp. You can 
> do this in a separate patch if you want.

Done

https://github.com/llvm/llvm-project/pull/78414
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[libc] [flang] [compiler-rt] [llvm] [clang-tools-extra] [lldb] [clang] [libcxx] [lld] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)

2024-01-22 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

> Why is so there so much special casing in the assembler/disassembler?

I'm not the original author of this change, but from what I understand it is a 
workaround to handle VOP3 instructions which have a single source but require 
the use of two bits from OPSEL.
`V_CVT_F32_FP8` has one source but uses two bits from OPSEL to specify 
which part of the 32-bit register to convert ([7:0], [15:8], [23:16] or 
[31:24]). And since OPSEL bits are correlated with sources/destination (one 
OPSEL bit per source/destination), this is required unless we make deeper 
changes to TableGen.

I'm open to changing TableGen, but I would prefer to create a new ticket and do 
it in a new PR. That change may take longer than one day and we would like to 
have this PR merged before LLVM branching.

https://github.com/llvm/llvm-project/pull/78414
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-22 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd updated 
https://github.com/llvm/llvm-project/pull/78729

>From eb04956ce8ad84206a95789885003dd6c6f60d2e Mon Sep 17 00:00:00 2001
From: Mariusz Sikora 
Date: Fri, 19 Jan 2024 16:29:46 +0100
Subject: [PATCH] [AMDGPU][GFX12] Add tests for unsupported builtins

__builtin_amdgcn_mfma* and __builtin_amdgcn_smfmac*
---
 .../builtins-amdgcn-gfx12-err.cl  | 106 +-
 1 file changed, 105 insertions(+), 1 deletion(-)

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
index bcaea9a2482d186..413212909701c19 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -4,10 +4,114 @@
 
 typedef unsigned int uint;
 
-kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) {
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef intv2i   __attribute__((ext_vector_type(2)));
+typedef intv4i   __attribute__((ext_vector_type(4)));
+typedef intv16i  __attribute__((ext_vector_type(16)));
+typedef intv32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+void builtin_test_unsupported(global v32f*out_v32f,
+  global v16f*out_v16f,
+  global v4f* out_v4f,
+  global v32i*out_v32i,
+  global v16i*out_v16i,
+  global v4i* out_v4i,
+  global v4d* out_v4d,
+  global double*  out_double,
+  double a_double , double b_double , double 
c_double,
+  float a_float   , float  b_float  , float  
c_float,
+  int   a_int , intb_int, intc_int,
+  long  a_long, long   b_long   , long   
c_long,
+  v4d   a_v4d , v4db_v4d, v4dc_v4d,
+  v8s   a_v8s , v8sb_v8s, v8sc_v8s,
+  v4s   a_v4s , v4sb_v4s, v4sc_v4s,
+  v2s   a_v2s , v2sb_v2s, v2sc_v2s,
+  v2i   a_v2i , v2ib_v2i, v2ic_v2i,
+  v16i  a_v16i, v16i   b_v16i   , v16i   
c_v16i,
+  v32i  a_v32i, v32i   b_v32i   , v32i   
c_v32i,
+  v4i   a_v4i , v4ib_v4i, v4ic_v4i,
+  v2f   a_v2f , v2fb_v2f, v2fc_v2f,
+  v4f   a_v4f , v4fb_v4f, v4fc_v4f,
+  v16f  a_v16f, v16f   b_v16f   , v16f   
c_v16f,
+  v32f  a_v32f, v32f   b_v32f   , v32f   
c_v32f,
+  v4h   a_v4h , v4hb_v4h, v4hc_v4h,
+  v8h   a_v8h , v8hb_v8h, v8hc_v8h,
+  int   idx,
+
+  uint a, uint b) {
+
   __builtin_amdgcn_ds_gws_init(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_init' needs target feature gws}}
   __builtin_amdgcn_ds_gws_barrier(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_barrier' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_v(a); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_v' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_br(a, b); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_br' needs target feature gws}}
   __builtin_amdgcn_ds_gws_sema_p(a); // expected-error 
{{'__builtin_amdgcn_ds_gws_sema_p' needs target feature gws}}
+
+  *out_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(a_float, b_float, c_v32f, 
0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_32x32x1f32' needs 
target feature mai-insts}}
+  *out_v16f = __builtin_amdgcn_mfma_f32_16x16x1f32(a_float, b_float, c_v16f, 
0, 0, 0); // expected-error {{'__builtin_amdgcn_mfma_f32_16x16x1f32' needs 
target feature mai-insts}}
+ 

[clang] [lldb] [flang] [lld] [libc] [libcxx] [compiler-rt] [llvm] [clang-tools-extra] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)

2024-01-22 Thread Mariusz Sikora via cfe-commits


@@ -626,11 +629,82 @@ class Cvt_PK_F32_F8_Pat;
 
-foreach Index = [0, -1] in {
-  def : Cvt_PK_F32_F8_Pat;
-  def : Cvt_PK_F32_F8_Pat;
+let SubtargetPredicate = isGFX9Only in {
+  foreach Index = [0, -1] in {
+def : Cvt_PK_F32_F8_Pat;
+def : Cvt_PK_F32_F8_Pat;
+  }
+}
+
+
+// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions.
+def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F  {
+  let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0,
+  clampmod:$clamp, omod:$omod, op_sel0:$op_sel);
+
+  let HasOpSel = 1;
+  let HasExtVOP3DPP = 0;
+}
+
+def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, i32, untyped]> {
+  let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0,
+  Src1Mod:$src1_modifiers, Src1RC64:$src1,
+  clampmod:$clamp, omod:$omod, op_sel0:$op_sel);
+  let AsmVOP3OpSel = !subst(", $src1_modifiers", "", getAsmVOP3OpSel<2, 0, 0, 
1, 1, 0>.ret);
+
+  let HasOpSel = 1;
+  let HasExtDPP = 1;
+  let HasExtVOP3DPP = 1;
+
+  let Src1VOP3DPP = Src1RC64;
+  let AsmVOP3DPP8 = getAsmVOP3DPP8.ret;
+  let AsmVOP3DPP16 = getAsmVOP3DPP16.ret;
+}
+
+let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0,
+SchedRW = [WriteFloatCvt] in {
+  defm V_CVT_F32_FP8_OP_SEL: VOP1Inst<"v_cvt_f32_fp8_op_sel", 
VOPProfile_Base_CVT_F32_F8_OpSel>;
+  defm V_CVT_F32_BF8_OP_SEL: VOP1Inst<"v_cvt_f32_bf8_op_sel", 
VOPProfile_Base_CVT_F32_F8_OpSel>;
+  defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", 
VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
+  defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", 
VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
+}
+
+class Cvt_F32_F8_Pat_OpSel index,
+VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
+(f32 (node i32:$src, index)),
+!if (index,
+ (inst_e64 !if(index{0}, SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), $src,
+   !if(index{1}, SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), (i32 0),

mariusz-sikora-at-amd wrote:

I removed SRCMODS.OP_SEL_1 from the pattern

https://github.com/llvm/llvm-project/pull/78414
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lldb] [flang] [lld] [libc] [libcxx] [compiler-rt] [llvm] [clang-tools-extra] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)

2024-01-22 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd edited 
https://github.com/llvm/llvm-project/pull/78414
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[flang] [lld] [clang-tools-extra] [compiler-rt] [llvm] [libcxx] [lldb] [libc] [clang] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… (PR #78414)

2024-01-22 Thread Mariusz Sikora via cfe-commits

mariusz-sikora-at-amd wrote:

> > Correct, some of these instructions use opsel[1] which in LLVM in stored in 
> > src1_modifiers so a dummy src1 is used.
> 
> Why can't we just use `SRCMODS.OP_SEL_1` with src0?

By `SRCMODS.OP_SEL_1`, do you mean `src1_modifier` (the second bit in 
`OPSEL`), or do you mean `OPSEL_HI`?

https://github.com/llvm/llvm-project/pull/78414
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] ea064ee - [AMDGPU] Create Subtarget Features for some of 16 bits atomic fadd instructions

2023-03-24 Thread Mariusz Sikora via cfe-commits

Author: Mariusz Sikora
Date: 2023-03-24T13:10:40+01:00
New Revision: ea064ee2a3bd22f5598d0eb76a1bbc3bf293b063

URL: 
https://github.com/llvm/llvm-project/commit/ea064ee2a3bd22f5598d0eb76a1bbc3bf293b063
DIFF: 
https://github.com/llvm/llvm-project/commit/ea064ee2a3bd22f5598d0eb76a1bbc3bf293b063.diff

LOG: [AMDGPU] Create Subtarget Features for some of 16 bits atomic fadd 
instructions

Introducing Subtarget Features for instructions:
- ds_pk_add_bf16
- ds_pk_add_f16
- ds_pk_add_rtn_bf16
- ds_pk_add_rtn_f16
- flat_atomic_pk_add_f16
- flat_atomic_pk_add_bf16
- global_atomic_pk_add_f16
- global_atomic_pk_add_bf16
- buffer_atomic_pk_add_f16

Differential Revision: https://reviews.llvm.org/D146701

Added: 
clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl

Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/Basic/Targets/AMDGPU.cpp
clang/test/CodeGenOpenCL/amdgpu-features.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/BUFInstructions.td
llvm/lib/Target/AMDGPU/DSInstructions.td
llvm/lib/Target/AMDGPU/FLATInstructions.td
llvm/lib/Target/AMDGPU/GCNSubtarget.h

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8e7449d426bff..ed75b58ddbf96 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -215,7 +215,7 @@ TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "", "nc", 
"gfx9-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1d", "t", 
"gfx90a-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", 
"gfx90a-insts")
-TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", 
"atomic-buffer-global-pk-add-f16-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1d", "t", 
"gfx90a-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmax_f64, "dd*1d", "t", 
"gfx90a-insts")
 
@@ -227,10 +227,10 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, 
"dd*3d", "t", "gfx90a-insts"
 TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f32, "ff*0f", "t", 
"gfx940-insts")
-TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", 
"gfx940-insts")
-TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", 
"gfx940-insts")
-TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", 
"gfx940-insts")
-TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", 
"gfx940-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", 
"atomic-flat-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", 
"atomic-flat-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", 
"atomic-global-pk-add-bf16-inst")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", 
"atomic-ds-pk-add-16-insts")
 
 
//===----------------------------------------------------------------------===//
 // Deep learning builtins.

diff  --git a/clang/lib/Basic/Targets/AMDGPU.cpp 
b/clang/lib/Basic/Targets/AMDGPU.cpp
index 8dd27670d1c18..72dfb07804dff 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -257,9 +257,13 @@ bool AMDGPUTargetInfo::initFeatureMap(
 case GK_GFX940:
   Features["gfx940-insts"] = true;
   Features["fp8-insts"] = true;
+  Features["atomic-ds-pk-add-16-insts"] = true;
+  Features["atomic-flat-pk-add-16-insts"] = true;
+  Features["atomic-global-pk-add-bf16-inst"] = true;
   [[fallthrough]];
 case GK_GFX90A:
   Features["gfx90a-insts"] = true;
+  Features["atomic-buffer-global-pk-add-f16-insts"] = true;
   [[fallthrough]];
 case GK_GFX908:
   Features["dot3-insts"] = true;

diff  --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 9e24290668d92..4a4da6b270b9a 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -72,9 +72,9 @@
 // GFX906: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX908: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+

[clang] 69061f9 - [AMDGPU] Add clang builtin for __builtin_amdgcn_ds_atomic_fadd_v2f16

2023-03-24 Thread Mariusz Sikora via cfe-commits

Author: Mariusz Sikora
Date: 2023-03-24T16:27:44+01:00
New Revision: 69061f96275c3053623a8699ce641c0f0ac61aed

URL: 
https://github.com/llvm/llvm-project/commit/69061f96275c3053623a8699ce641c0f0ac61aed
DIFF: 
https://github.com/llvm/llvm-project/commit/69061f96275c3053623a8699ce641c0f0ac61aed.diff

LOG: [AMDGPU] Add clang builtin for __builtin_amdgcn_ds_atomic_fadd_v2f16

Differential Revision: https://reviews.llvm.org/D146808

Added: 


Modified: 
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl
clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index ed75b58ddbf96..965bd97a97d79 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -231,6 +231,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, 
"V2hV2h*0V2h", "t", "ato
 TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", 
"atomic-flat-pk-add-16-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", 
"atomic-global-pk-add-bf16-inst")
 TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", 
"atomic-ds-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", 
"atomic-ds-pk-add-16-insts")
 
 
//===----------------------------------------------------------------------===//
 // Deep learning builtins.

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b3aea13878c1c..c8112b0ea0ec0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17213,7 +17213,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 return Builder.CreateCall(F, {Addr, Val});
   }
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy;
 switch (BuiltinID) {
@@ -17225,6 +17226,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
   IID = Intrinsic::amdgcn_ds_fadd;
   break;
+case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
+  ArgTy = llvm::FixedVectorType::get(
+  llvm::Type::getHalfTy(getLLVMContext()), 2);
+  IID = Intrinsic::amdgcn_ds_fadd;
+  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl
index 3044fdedca36b..39191322ca6e4 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx11-err.cl
@@ -15,4 +15,5 @@ void test_atomic_fadd(__global half2 *addrh2, __local half2 
*addrh2l, half2 xh2,
   __builtin_amdgcn_global_atomic_fadd_v2bf16(addrs2, xs2); // 
expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2bf16' needs target 
feature atomic-global-pk-add-bf16-inst}}
   __builtin_amdgcn_global_atomic_fadd_v2f16(addrh2, xh2); // 
expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2f16' needs target 
feature atomic-buffer-global-pk-add-f16-insts}}
   __builtin_amdgcn_ds_atomic_fadd_v2bf16(addrs2l, xs2); // 
expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2bf16' needs target feature 
atomic-ds-pk-add-16-insts}}
+  __builtin_amdgcn_ds_atomic_fadd_v2f16(addrh2l, xh2); // 
expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2f16' needs target feature 
atomic-ds-pk-add-16-insts}}
 }

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl
index fd813ac029eab..0548b825a7265 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl
@@ -4,9 +4,9 @@
 
 typedef half __attribute__((ext_vector_type(2))) half2;
 
-void test_global_add_2f16(__global half2 *addrh2, half2 xh2,
-  __global float *addrf, float xf,
-  __global double *addr, double x) {
+void test_global_fadd(__global half2 *addrh2, __local half2 *addrh2l, half2 
xh2,
+  __global float *addrf, float xf,
+  __global double *addr, double x) {
   half2 *half_rtn;
   float *fp_rtn;
   double *rtn;
@@ -18,4 +18,5 @@ void test_gl

[clang] [llvm] AMDGPU: Define v_mfma_f32_{16x16x128|32x32x64}_f8f6f4 instructions (PR #116723)

2024-11-21 Thread Mariusz Sikora via cfe-commits


@@ -15454,6 +15454,23 @@ void 
SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
 MRI.setRegClass(Op.getReg(), NewRC);
   }
 
+  if (TII->isMAI(MI)) {
+// The ordinary src0, src1, src2 were legalized above.
+//
+// We have to also legalize the appended v_mfma_ld_scale_b32 operands,
+// as a separate instruction.
+int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::scale_src0);
+if (Src0Idx != -1) {
+  int Src1Idx = Src0Idx + 2;
+  assert(Src1Idx = AMDGPU::getNamedOperandIdx(

mariusz-sikora-at-amd wrote:

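== ? (The quoted assert uses `=`, which assigns to Src1Idx instead of comparing it against the looked-up operand index.)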

https://github.com/llvm/llvm-project/pull/116723
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Run DL builtin tests for new GFX (PR #130054)

2025-03-06 Thread Mariusz Sikora via cfe-commits

https://github.com/mariusz-sikora-at-amd closed 
https://github.com/llvm/llvm-project/pull/130054
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits