[llvm-branch-commits] [mlir] [mlir][Transforms] Dialect conversion: Fix missing source materialization (PR #97903)

2024-07-13 Thread Matthias Springer via llvm-branch-commits

https://github.com/matthias-springer edited 
https://github.com/llvm/llvm-project/pull/97903


[llvm-branch-commits] [mlir] [mlir][Transforms] Dialect conversion: Simplify handling of dropped arguments (PR #97213)

2024-07-13 Thread Matthias Springer via llvm-branch-commits

https://github.com/matthias-springer edited 
https://github.com/llvm/llvm-project/pull/97213


[llvm-branch-commits] [mlir] [mlir][Transforms] Dialect conversion: Fix missing source materialization (PR #97903)

2024-07-13 Thread Matthias Springer via llvm-branch-commits

https://github.com/matthias-springer updated 
https://github.com/llvm/llvm-project/pull/97903

From 577317640c0f5800650157f5c4c6fbe6220eba38 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sat, 6 Jul 2024 14:28:41 +0200
Subject: [PATCH] fix test

---
 mlir/docs/DialectConversion.md|  3 +-
 .../SCF/TransformOps/SCFTransformOps.td   | 11 +++
 .../mlir/Transforms/DialectConversion.h   | 10 +--
 .../Conversion/LLVMCommon/TypeConverter.cpp   | 28 --
 .../Dialect/SCF/TransformOps/CMakeLists.txt   |  1 +
 .../SCF/TransformOps/SCFTransformOps.cpp  | 13 ++-
 .../Transforms/Utils/DialectConversion.cpp| 87 ++-
 .../FuncToLLVM/func-memref-return.mlir|  4 +-
 .../Transforms/test-block-legalization.mlir   | 44 ++
 9 files changed, 141 insertions(+), 60 deletions(-)
 create mode 100644 mlir/test/Transforms/test-block-legalization.mlir

diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md
index db26e6477d5fc..23e74470a835f 100644
--- a/mlir/docs/DialectConversion.md
+++ b/mlir/docs/DialectConversion.md
@@ -352,7 +352,8 @@ class TypeConverter {
 
   /// This method registers a materialization that will be called when
   /// converting (potentially multiple) block arguments that were the result of
-  /// a signature conversion of a single block argument, to a single SSA value.
+  /// a signature conversion of a single block argument, to a single SSA value
+  /// with the old argument type.
  template <typename FnT, typename T = typename llvm::function_traits<
                                           std::decay_t<FnT>>::template arg_t<1>>
   void addArgumentMaterialization(FnT &&callback) {
diff --git a/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td 
b/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td
index 7bf914f6456ce..20880d94a83ca 100644
--- a/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td
+++ b/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td
@@ -38,6 +38,17 @@ def ApplySCFStructuralConversionPatternsOp :
+def ApplySCFToControlFlowPatternsOp : Op<Transform_Dialect,
+    "apply_conversion_patterns.scf.scf_to_control_flow",
+    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface>]> {
+  let description = [{
+Collects patterns that lower structured control flow ops to unstructured
+control flow.
+  }];
+
+  let assemblyFormat = "attr-dict";
+}
+
 def Transform_ScfForOp : Transform_ConcreteOpType<"scf.for">;
 
 def ForallToForOp : Op<...
  template <typename FnT, typename T = typename llvm::function_traits<
                                           std::decay_t<FnT>>::template arg_t<1>>
   void addArgumentMaterialization(FnT &&callback) {
diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp 
b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
index f5620a6a7cd91..32d02d5e438bd 100644
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -153,9 +153,11 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
type.isVarArg());
   });
 
-  // Materialization for memrefs creates descriptor structs from individual
-  // values constituting them, when descriptors are used, i.e. more than one
-  // value represents a memref.
+  // Argument materializations convert from the new block argument types
+  // (multiple SSA values that make up a memref descriptor) back to the
+  // original block argument type. The dialect conversion framework will then
+  // insert a target materialization from the original block argument type to
+  // a legal type.
   addArgumentMaterialization(
   [&](OpBuilder &builder, UnrankedMemRefType resultType, ValueRange inputs,
    Location loc) -> std::optional<Value> {
@@ -164,12 +166,18 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
   // memref descriptor cannot be built just from a bare pointer.
   return std::nullopt;
 }
-return UnrankedMemRefDescriptor::pack(builder, loc, *this, resultType,
-  inputs);
+Value desc = UnrankedMemRefDescriptor::pack(builder, loc, *this,
+resultType, inputs);
+// An argument materialization must return a value of type
+// `resultType`, so insert a cast from the memref descriptor type
+// (!llvm.struct) to the original memref type.
+return builder.create<UnrealizedConversionCastOp>(loc, resultType, desc)
+.getResult(0);
   });
   addArgumentMaterialization([&](OpBuilder &builder, MemRefType resultType,
  ValueRange inputs,
   Location loc) -> std::optional<Value> {
+Value desc;
 if (inputs.size() == 1) {
   // This is a bare pointer. We allow bare pointers only for function entry
   // blocks.
@@ -180,10 +188,16 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
   if (!block->isEntryBlock() ||
   !isa<FunctionOpInterface>(block->getParentOp()))
 return std::nullopt;
-  return MemRefDescriptor::fromStaticShape(builder, loc, *this, resultType,
+  desc = MemRefDescriptor::fromStaticShape(builder, loc, *this, resultType,
inputs[0]);
+} else {
+  desc = MemRefDescriptor::pack(builder, loc, *this, resultType, inputs);

[llvm-branch-commits] [mlir] [mlir][Transforms] Dialect conversion: Simplify handling of dropped arguments (PR #97213)

2024-07-13 Thread Matthias Springer via llvm-branch-commits

https://github.com/matthias-springer updated 
https://github.com/llvm/llvm-project/pull/97213

From b0b781318e5f76f28b9340efbdcc3f91c2c1cf31 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sat, 13 Jul 2024 17:36:37 +0200
Subject: [PATCH] [mlir][Transforms] Dialect conversion: Simplify handling of
 dropped arguments

This commit simplifies the handling of dropped arguments and updates some 
outdated dialect conversion documentation.

When converting a block signature, a BlockTypeConversionRewrite object and 
potentially multiple ReplaceBlockArgRewrite objects are created. During the 
"commit" phase, uses of the old block arguments are replaced with the new 
block arguments, but the old implementation did this inconsistently: some 
block arguments were replaced in BlockTypeConversionRewrite::commit and some 
in ReplaceBlockArgRewrite::commit. The new BlockTypeConversionRewrite::commit 
implementation is much simpler and no longer modifies any IR; that is now done 
only in ReplaceBlockArgRewrite. The ConvertedArgInfo data structure is no 
longer needed.

To that end, materializations of dropped arguments are now built in 
applySignatureConversion instead of materializeLiveConversions; the latter 
function no longer has to deal with dropped arguments.

Other minor improvements:

- Improve a variable name: origOutputType -> origArgType. Add an assertion to
  check that this field is only used for argument materializations.
- Add more comments to applySignatureConversion.

Note: Error messages around failed materializations for dropped basic block 
arguments changed slightly. That is because those materializations are now 
built in legalizeUnresolvedMaterialization instead of 
legalizeConvertedArgumentTypes.

This commit is in preparation for decoupling argument/source/target 
materializations from the dialect conversion framework.

This is a re-upload of #96207.
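
For illustration, here is a minimal sketch of how the three materialization
kinds discussed here are registered on a TypeConverter, assuming the
pre-refactor callback API shown in the diff below; the helper name and the use
of unrealized_conversion_cast as the cast op are illustrative, not part of
this patch:

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Transforms/DialectConversion.h"

using namespace mlir;

static void registerMaterializations(TypeConverter &converter) {
  // Argument materialization: N new block arguments -> one SSA value of the
  // original block argument type.
  converter.addArgumentMaterialization(
      [](OpBuilder &builder, Type origArgType, ValueRange newArgs,
         Location loc) -> std::optional<Value> {
        return builder
            .create<UnrealizedConversionCastOp>(loc, origArgType, newArgs)
            .getResult(0);
      });
  // Source materialization: convert from a legal type back to an illegal
  // (source) type.
  converter.addSourceMaterialization(
      [](OpBuilder &builder, Type sourceType, ValueRange inputs,
         Location loc) -> std::optional<Value> {
        return builder
            .create<UnrealizedConversionCastOp>(loc, sourceType, inputs)
            .getResult(0);
      });
  // Target materialization: convert from an illegal type to a legal
  // (target) type.
  converter.addTargetMaterialization(
      [](OpBuilder &builder, Type targetType, ValueRange inputs,
         Location loc) -> std::optional<Value> {
        return builder
            .create<UnrealizedConversionCastOp>(loc, targetType, inputs)
            .getResult(0);
      });
}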
---
 .../Transforms/Utils/DialectConversion.cpp| 173 ++
 .../test-legalize-type-conversion.mlir|   6 +-
 2 files changed, 57 insertions(+), 122 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp 
b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 1e0afee2373a9..0b552a7e1ca3b 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -432,34 +432,14 @@ class MoveBlockRewrite : public BlockRewrite {
   Block *insertBeforeBlock;
 };
 
-/// This structure contains the information pertaining to an argument that has
-/// been converted.
-struct ConvertedArgInfo {
-  ConvertedArgInfo(unsigned newArgIdx, unsigned newArgSize,
-   Value castValue = nullptr)
-  : newArgIdx(newArgIdx), newArgSize(newArgSize), castValue(castValue) {}
-
-  /// The start index of in the new argument list that contains arguments that
-  /// replace the original.
-  unsigned newArgIdx;
-
-  /// The number of arguments that replaced the original argument.
-  unsigned newArgSize;
-
-  /// The cast value that was created to cast from the new arguments to the
-  /// old. This only used if 'newArgSize' > 1.
-  Value castValue;
-};
-
 /// Block type conversion. This rewrite is partially reflected in the IR.
 class BlockTypeConversionRewrite : public BlockRewrite {
 public:
-  BlockTypeConversionRewrite(
-  ConversionPatternRewriterImpl &rewriterImpl, Block *block,
-  Block *origBlock, SmallVector<std::optional<ConvertedArgInfo>, 1> argInfo,
-  const TypeConverter *converter)
+  BlockTypeConversionRewrite(ConversionPatternRewriterImpl &rewriterImpl,
+ Block *block, Block *origBlock,
+ const TypeConverter *converter)
   : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, block),
-origBlock(origBlock), argInfo(argInfo), converter(converter) {}
+origBlock(origBlock), converter(converter) {}
 
   static bool classof(const IRRewrite *rewrite) {
 return rewrite->getKind() == Kind::BlockTypeConversion;
@@ -479,10 +459,6 @@ class BlockTypeConversionRewrite : public BlockRewrite {
   /// The original block that was requested to have its signature converted.
   Block *origBlock;
 
-  /// The conversion information for each of the arguments. The information is
-  /// std::nullopt if the argument was dropped during conversion.
-  SmallVector<std::optional<ConvertedArgInfo>, 1> argInfo;
-
   /// The type converter used to convert the arguments.
   const TypeConverter *converter;
 };
@@ -691,12 +667,16 @@ class CreateOperationRewrite : public OperationRewrite {
 /// The type of materialization.
 enum MaterializationKind {
   /// This materialization materializes a conversion for an illegal block
-  /// argument type, to a legal one.
+  /// argument type, to the original one.
   Argument,
 
   /// This materialization materializes a conversion from an illegal type to a
   /// legal one.
-  Target
+  Target,
+
+  /// This materialization materializes a conversion from a legal type back to
+  /// an illegal one.
+  Source
 };
 
 

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins (PR #96873)

2024-07-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96873

From 1d92ea8f8ee953a654f4de02f0d9a9596148bcf1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:12:59 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from
 {global|flat}_atomic_fadd_v2f16 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 20 ++-
 .../builtins-fp-atomics-gfx12.cl  |  9 ++---
 .../builtins-fp-atomics-gfx90a.cl |  2 +-
 .../builtins-fp-atomics-gfx940.cl |  3 ++-
 4 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3cf708da269b3..105566b584108 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18659,22 +18659,15 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -18694,11 +18687,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ArgTy = llvm::Type::getFloatTy(getLLVMContext());
   IID = Intrinsic::amdgcn_flat_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19091,7 +19079,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19109,6 +19099,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 6b8a6d14575db..07e63a8711c7f 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -48,7 +48,8 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr 
%{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX12-LABEL:  test_flat_add_2f16
 // GFX12: flat_atomic_pk_add_f16
 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
@@ -64,7 +65,8 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_global_add_half2
-// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr 
addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> 
%{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX12-LABEL:  test_global_add_half2
 // GFX12:  global_atomic_pk_add_f16 v2, v[0:1], v2, off

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-07-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96872

From 4168832ad877849e6dcb2a65d7a3f966546b14dc Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 11 Jun 2024 10:58:44 +0200
Subject: [PATCH 1/2] clang/AMDGPU: Emit atomicrmw for
 __builtin_amdgcn_global_atomic_fadd_{f32|f64}

We need to emit a syncscope and new metadata to get the native instruction
most of the time.
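
As a minimal sketch (not the actual clang codegen), this is the shape of the
atomicrmw that the builtins now expand to; the helper name and variable names
are illustrative:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"

// Emit an agent-scoped fadd with the metadata that lets the backend select
// the native instruction.
static llvm::Value *emitAgentScopedFAdd(llvm::IRBuilder<> &Builder,
                                        llvm::Value *Ptr, llvm::Value *Val) {
  llvm::LLVMContext &Ctx = Builder.getContext();
  llvm::AtomicRMWInst *RMW = Builder.CreateAtomicRMW(
      llvm::AtomicRMWInst::FAdd, Ptr, Val, llvm::MaybeAlign(),
      llvm::AtomicOrdering::SequentiallyConsistent,
      Ctx.getOrInsertSyncScopeID("agent"));
  // Without this metadata the backend must assume fine-grained remote memory
  // may be accessed and expands the operation to a cmpxchg loop instead of
  // the native instruction.
  RMW->setMetadata("amdgpu.no.fine.grained.memory",
                   llvm::MDNode::get(Ctx, {}));
  return RMW;
}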
---
 clang/lib/CodeGen/CGBuiltin.cpp   | 39 +--
 .../CodeGenOpenCL/builtins-amdgcn-gfx11.cl|  2 +-
 .../builtins-fp-atomics-gfx12.cl  |  4 +-
 .../builtins-fp-atomics-gfx90a.cl |  4 +-
 .../builtins-fp-atomics-gfx940.cl |  4 +-
 5 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6cc0d9485720c..15d5e80fcd402 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18658,8 +18659,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18671,18 +18670,11 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   ArgTy = llvm::FixedVectorType::get(
   llvm::Type::getHalfTy(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -19097,7 +19089,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19113,6 +19107,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19147,8 +19143,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-  SSID = llvm::SyncScope::System;
+  // Most of the builtins do not have syncscope/order arguments. For DS
+  // atomics the scope doesn't really matter, as they implicitly operate at
+  // workgroup scope.
+  //
+  // The global/flat cases need to use agent scope to consistently produce
+  // the native instruction instead of a cmpxchg expansion.
+  SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19163,6 +19164,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
 if (Volatile)
   RMW->setVolatile(true);
+
+unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+  // Most targets require "amdgpu.no.fine.grained.memory" to emit the 
native

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64} builtins (PR #96874)

2024-07-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96874

From de71919da0495132ac0bc9d4b8122af4f000191f Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:15:26 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64}
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp | 17 ++---
 .../CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl |  6 --
 .../CodeGenOpenCL/builtins-fp-atomics-gfx940.cl |  3 ++-
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 105566b584108..91702246c9571 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18661,10 +18661,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   }
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
@@ -18674,19 +18672,12 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmin;
   break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19081,7 +19072,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19101,6 +19094,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index cd10777dbe079..02e289427238f 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -45,7 +45,8 @@ void test_global_max_f64(__global double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_add_local_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr 
addrspace(3) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8{{$}}
+
 // GFX90A-LABEL:  test_flat_add_local_f64$local
 // GFX90A:  ds_add_rtn_f64
 void test_flat_add_local_f64(__local double *addr, double x){
@@ -54,7 +55,8 @@ void test_flat_add_local_f64(__local double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_global_add_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_flat_global_add_f64$local
 // GFX90A:  global_atomic_add_f64
 void test_flat_global_add_f64(__global double *addr, double x){
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index 589dcd406630d..bd9b8c7268e06 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -10,7 +10,8 @@ typedef half  _

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (PR #96875)

2024-07-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96875

From 948268ee720b7c10d10825653422fba3cb2f53d7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:34:43 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 26 ++-
 .../builtins-fp-atomics-gfx12.cl  | 24 -
 .../builtins-fp-atomics-gfx90a.cl |  6 ++---
 .../builtins-fp-atomics-gfx940.cl | 14 +++---
 4 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 91702246c9571..6c97b1d1108a5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18685,22 +18685,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
 return Builder.CreateCall(F, {Addr, Val});
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
-Intrinsic::ID IID;
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19074,7 +19058,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19096,6 +19082,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19140,7 +19128,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   AO = AtomicOrdering::Monotonic;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
-  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
+  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
 llvm::Type *V2BF16Ty = FixedVectorType::get(
 llvm::Type::getBFloatTy(Builder.getContext()), 2);
 Val = Builder.CreateBitCast(Val, V2BF16Ty);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 07e63a8711c7f..e8b6eb57c38d7 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
 
 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> 
[[BC0]] syncscope("agent") monotonic, align 4
+// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x 
bfloat> [[BC0]] syncscope("agent") monotonic, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
 
 // GFX12-LABEL:  test_local_add_2bf16
@@ -48,7 +48,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2f16
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
 
 // GFX12-LABEL:  test_flat_add_2f

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-07-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96876

From cbd9e1fcea9ee2f53192fc0a4b9e4dead78ab1bf Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 23:18:32 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max
 f64 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 36 +--
 .../builtins-fp-atomics-gfx90a.cl | 18 ++
 2 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6c97b1d1108a5..42d57671f9c76 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18659,32 +18659,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
-Intrinsic::ID IID;
-llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmax;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmax;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F =
-CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19060,7 +19034,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19087,8 +19065,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   BinOp = llvm::AtomicRMWInst::FMin;
   break;
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
   BinOp = llvm::AtomicRMWInst::FMax;
   break;
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 9381ce951df3e..556e553903d1a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -27,7 +27,8 @@ void test_global_add_half2(__global half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_global_global_min_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmin ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_global_min_f64$local
 // GFX90A:  global_atomic_min_f64
 void test_global_global_min_f64(__global double *addr, double x){
@@ -36,7 +37,8 @@ void test_global_global_min_f64(__global double *addr, double 
x){
 }
 
 // CHECK-LABEL: test_global_max_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmax ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_max_f64$local
 // GFX90A:  global_atomic_max_f64
 void test_global_max_f64(__global double *addr, double x){
@@ -65,7 +67,8 @@ void test_flat_global_add_f64(__global double *addr, doub

[llvm-branch-commits] [llvm] AMDGPU: Remove flat/global atomic fadd v2bf16 intrinsics (PR #97050)

2024-07-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/97050

From d9dbd49a454a6793afca51474efc836a1d2095b7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 27 Jun 2024 16:32:48 +0200
Subject: [PATCH] AMDGPU: Remove flat/global atomic fadd v2bf16 intrinsics

These are now fully covered by atomicrmw.
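
As a rough sketch (the real logic is in upgradeAMDGCNIntrinsicCall in the diff
below; the ordering, scope, and helper name here are illustrative), the
upgrade rewrites a call to one of the removed v2bf16 fadd intrinsics into an
atomicrmw plus bitcasts between the intrinsic's v2i16 type and bfloat:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

static void upgradeV2BF16FAddCall(llvm::CallBase *CI) {
  llvm::IRBuilder<> Builder(CI);
  llvm::LLVMContext &Ctx = CI->getContext();
  llvm::Value *Ptr = CI->getArgOperand(0);
  // The old intrinsics used <2 x i16> because LLVM IR had no bf16 type when
  // they were added; atomicrmw fadd wants the real <2 x bfloat> type.
  llvm::Type *V2BF16Ty =
      llvm::FixedVectorType::get(llvm::Type::getBFloatTy(Ctx), 2);
  llvm::Value *Val = Builder.CreateBitCast(CI->getArgOperand(1), V2BF16Ty);
  llvm::AtomicRMWInst *RMW = Builder.CreateAtomicRMW(
      llvm::AtomicRMWInst::FAdd, Ptr, Val, llvm::MaybeAlign(),
      llvm::AtomicOrdering::SequentiallyConsistent);
  // Non-LDS pointers get the no-fine-grained-memory annotation, matching the
  // AutoUpgrade change in this patch.
  RMW->setMetadata("amdgpu.no.fine.grained.memory",
                   llvm::MDNode::get(Ctx, {}));
  CI->replaceAllUsesWith(Builder.CreateBitCast(RMW, CI->getType()));
  CI->eraseFromParent();
}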
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   4 -
 llvm/lib/IR/AutoUpgrade.cpp   |  14 +-
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   2 -
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   2 -
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 -
 llvm/lib/Target/AMDGPU/FLATInstructions.td|   2 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   6 +-
 llvm/test/Bitcode/amdgcn-atomic.ll|  22 ++
 .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll| 106 -
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 210 --
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 193 
 11 files changed, 33 insertions(+), 530 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 71b1e832bde3c..9cf4d6352d23d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2907,10 +2907,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic {
 def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic;
 }
 
-// bf16 atomics use v2i16 argument since there is no bf16 data type in the 
llvm.
-def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-
 defset list AMDGPUMFMAIntrinsics940 = {
 def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic;
 def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 53de9eef516b3..f566a0e3c3043 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1034,7 +1034,9 @@ static bool upgradeIntrinsicFunction1(Function *F, 
Function *&NewFn,
   }
 
   if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
-  Name.starts_with("ds.fmax")) {
+  Name.starts_with("ds.fmax") ||
+  Name.starts_with("global.atomic.fadd.v2bf16") ||
+  Name.starts_with("flat.atomic.fadd.v2bf16")) {
 // Replaced with atomicrmw fadd/fmin/fmax, so there's no new
 // declaration.
 NewFn = nullptr;
@@ -4042,7 +4044,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, 
CallBase *CI,
   .StartsWith("ds.fmin", AtomicRMWInst::FMin)
   .StartsWith("ds.fmax", AtomicRMWInst::FMax)
   .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
-  .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
+  .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap)
+  .StartsWith("global.atomic.fadd", AtomicRMWInst::FAdd)
+  .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd);
 
   unsigned NumOperands = CI->getNumOperands();
   if (NumOperands < 3) // Malformed bitcode.
@@ -4097,8 +4101,10 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, 
CallBase *CI,
   Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID);
 
   if (PtrTy->getAddressSpace() != 3) {
-RMW->setMetadata("amdgpu.no.fine.grained.memory",
- MDNode::get(F->getContext(), {}));
+MDNode *EmptyMD = MDNode::get(F->getContext(), {});
+RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+if (RMWOp == AtomicRMWInst::FAdd && RetTy->isFloatTy())
+  RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
   }
 
   if (IsVolatile)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index c6dbc58395e48..db8b44149cf47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -620,12 +620,10 @@ multiclass local_addr_space_atomic_op {
 
 defm int_amdgcn_flat_atomic_fadd : noret_op;
 defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
-defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op;
 defm int_amdgcn_flat_atomic_fmin : noret_op;
 defm int_amdgcn_flat_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
 defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
-defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op;
 defm int_amdgcn_global_atomic_fmin : noret_op;
 defm int_amdgcn_global_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_csub : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 9e7694f41d6b8..74686cd10512c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4897,8 +4897,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const 
MachineInstr &MI) const {
 case Intrinsic::amdgcn_flat_atomic_fmax:
 case Intrinsic