[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96443

>From f29955ee4dfb3319d0ea99187d2cc24587c9e716 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH 1/3] AMDGPU: Add subtarget feature for global atomic fadd
 denormal support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 14 --
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 56ec5e9c4cfc2..6b212e1b2af03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1631,7 +1639,9 @@ def FeatureISAVersion12 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
-   Feature1_5xVGPRs]>;
+   Feature1_5xVGPRs,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   ]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 9e2a316a9ed28..db0b2b67a0388 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.

>From 2eed4079db56e5983166e5fb0e55ae3d80594f19 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 12:10:37 +0200
Subject: [PATCH 2/3] Add to gfx11.

RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm"
though I'm not sure I trust it.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 6b212e1b2af03..39a1d629a4aea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen
@@ -1640,7 +1641,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
Feature1_5xVGPRs,
-   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<

>From b57b67e84ee062f993d39fc33d248d2cb73e2c6a Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 11:30:51 +0200
Subject: [PATCH 3/3] Rename

---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 39a1d629a4aea..34c6f6ff19bff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -78

[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From 36cbbdfaa31c6313c96a9c908bade1e6f7debc5b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 21 ++---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td |  6 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 10 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 34c6f6ff19bff..84ea040477763 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFAddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasMemoryAtomicFaddF32DenormalSupport",
@@ -1390,7 +1397,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
@@ -1435,7 +1443,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFAddF32DenormalSupport
+   FeatureMemoryAtomicFAddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1932,11 +1941,9 @@ def isGFX12Plus :
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
-
-def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
-  Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
-  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasFlatBufferGlobalAtomicFaddF64Inst :
+  Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
+  AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
 
 def HasAtomicFMinFMaxF32GlobalInsts :
   Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 3b8d94b744000..a904c8483dbf5 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in {
   }
 } // End SubtargetPredicate = isGFX90APlus
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+}
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
   let SubtargetPredicate = isGFX940Plus;
@@ -1836,9 +1838,9 @@ let SubtargetPredicate = 
HasAtomicBufferGlobalPkAddF16Insts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
 } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, 
"BUFFER_ATOMIC_ADD_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..89946a4719557 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td

[llvm-branch-commits] [mlir] [mlir][linalg] Decompose winograd operators (PR #96183)

2024-06-27 Thread Hsiangkai Wang via llvm-branch-commits


@@ -23,6 +26,156 @@ namespace linalg {
 
 namespace {
 
+// clang-format off
+// Winograd Conv2D uses a minimal 2D filtering algorithm to calculate its
+// result. The formula of minimal 2D filtering algorithm F(m x m, r x r),
+// m is the output dimension and r is the filter dimension, is
+//
+// Y = A^T x [ (G x g x G^T) x (B^T x d x B) ] x A
+//
+// g is filter and d is input data. We need to prepare 6 constant
+// transformation matrices, G, G^T, B^T, B, A^T, and A for this formula.
+//
+// The following tables define these constant transformation matrices for
+// F(2 x 2, 3 x 3), F(4 x 4, 3 x 3), and F(2 x 2, 5 x 5)
+constexpr float G_2x2_3x3[] = {
+   -1, 0,   0,
+ 1./2, -1./2, 1./2,
+ 1./2,  1./2, 1./2,
+0, 0,1
+};
+
+constexpr float GT_2x2_3x3[] = {
+   -1,  1./2, 1./2, 0,
+0, -1./2, 1./2, 0,
+0,  1./2, 1./2, 1
+};

Hsiangkai wrote:

Can you elaborate on it a bit more? I am not sure what the idea is here. Thank you.

https://github.com/llvm/llvm-project/pull/96183
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][linalg] Decompose winograd operators (PR #96183)

2024-06-27 Thread Hsiangkai Wang via llvm-branch-commits


@@ -36,6 +189,92 @@ constexpr TransformMapKeyTy F_2_3{2, 3};
 constexpr TransformMapKeyTy F_4_3{4, 3};
 constexpr TransformMapKeyTy F_2_5{2, 5};
 
+struct TransformMatrix {

Hsiangkai wrote:

Done.

https://github.com/llvm/llvm-project/pull/96183
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][linalg] Decompose winograd operators (PR #96183)

2024-06-27 Thread Hsiangkai Wang via llvm-branch-commits


@@ -36,6 +189,92 @@ constexpr TransformMapKeyTy F_2_3{2, 3};
 constexpr TransformMapKeyTy F_4_3{4, 3};
 constexpr TransformMapKeyTy F_2_5{2, 5};
 
+struct TransformMatrix {
+  TransformMatrix(const float *table, int64_t rows, int64_t cols,
+  int64_t scalarFactor = 1)
+  : table(table), rows(rows), cols(cols), scalarFactor(scalarFactor) {}
+
+  const float *table;
+  int64_t rows;
+  int64_t cols;
+  int64_t scalarFactor;
+};
+
+Value create2DTransformMatrix(RewriterBase &rewriter, Location loc,
+  TransformMatrix transform, Type type) {
+  ArrayRef const_vec(transform.table, transform.rows * transform.cols);

Hsiangkai wrote:

Fixed.

https://github.com/llvm/llvm-project/pull/96183
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][linalg] Decompose winograd operators (PR #96183)

2024-06-27 Thread Hsiangkai Wang via llvm-branch-commits


@@ -48,6 +287,261 @@ Value collapse2DData(RewriterBase &rewriter, Location loc, 
Value data) {
   reassociation);
 }
 
+// This function transforms the filter. The data layout of the filter is FHWC.
+// The transformation matrix is 2-dimension. We need to extract H x W from
+// FHWC first. We need to generate 2 levels of loops to iterate on F and C.
+// After the transformation, we get
+//
+// scf.for %f = lo_f to hi_f step 1
+//   scf.for %c = lo_c to hi_c step 1
+// %extracted = extract filter from filter
+// %ret = linalg.matmul G, %extracted
+// %ret = linalg.matmul %ret, GT
+// %inserted = insert %ret into filter
+//

Hsiangkai wrote:

Fixed.

https://github.com/llvm/llvm-project/pull/96183
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][linalg] Decompose winograd operators (PR #96183)

2024-06-27 Thread Hsiangkai Wang via llvm-branch-commits


@@ -48,6 +287,261 @@ Value collapse2DData(RewriterBase &rewriter, Location loc, 
Value data) {
   reassociation);
 }
 
+// This function transforms the filter. The data layout of the filter is FHWC.
+// The transformation matrix is 2-dimension. We need to extract H x W from
+// FHWC first. We need to generate 2 levels of loops to iterate on F and C.
+// After the transformation, we get
+//
+// scf.for %f = lo_f to hi_f step 1
+//   scf.for %c = lo_c to hi_c step 1
+// %extracted = extract filter from filter
+// %ret = linalg.matmul G, %extracted
+// %ret = linalg.matmul %ret, GT
+// %inserted = insert %ret into filter
+//
+Value filterTransform(RewriterBase &rewriter, Location loc, Value filter,
+  Value retValue, int64_t m, int64_t r,
+  bool leftTransform = true, bool rightTransform = true) {
+  // Map from (m, r) to G transform matrix.
+  static const llvm::SmallDenseMap
+  GMatrices = {
+  {F_2_3, TransformMatrix(G_2x2_3x3, 4, 3)},
+  {F_4_3, TransformMatrix(G_4x4_3x3, 6, 3)},
+  {F_2_5, TransformMatrix(G_2x2_5x5, 6, 5)},
+  };
+
+  // Map from (m, r) to GT transform matrix.
+  static const llvm::SmallDenseMap
+  GTMatrices = {
+  {F_2_3, TransformMatrix(GT_2x2_3x3, 3, 4)},
+  {F_4_3, TransformMatrix(GT_4x4_3x3, 3, 6)},
+  {F_2_5, TransformMatrix(GT_2x2_5x5, 5, 6)},
+  };
+
+  auto filterType = cast(filter.getType());
+  Type elementType = filterType.getElementType();
+  auto filterShape = filterType.getShape(); // F, H, W, C
+  int64_t filterF = filterShape[0];
+  int64_t filterH = filterShape[1];
+  int64_t filterW = filterShape[2];
+  int64_t filterC = filterShape[3];
+
+  if (filterH != r && filterH != 1)
+return Value();
+  if (filterW != r && filterW != 1)
+return Value();
+
+  // Return shape is 
+  auto zeroIdx = rewriter.create(loc, 0);
+  auto fUpperBound = rewriter.create(loc, filterF);
+  auto cUpperBound = rewriter.create(loc, filterC);
+  auto oneStep = rewriter.create(loc, 1);
+  auto outerForOp =
+  rewriter.create(loc, zeroIdx, fUpperBound, oneStep, 
retValue);
+  Block *outerForBody = outerForOp.getBody();
+  rewriter.setInsertionPointToStart(outerForBody);
+  Value FIter = outerForBody->getArgument(0);

Hsiangkai wrote:

I use buildLoopNest to create the loops and use a callback to construct the
innermost loop body.

https://github.com/llvm/llvm-project/pull/96183
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][linalg] Decompose winograd operators (PR #96183)

2024-06-27 Thread Hsiangkai Wang via llvm-branch-commits


@@ -100,6 +594,161 @@ Value matrixMultiply(RewriterBase &rewriter, Location loc,
   return expandOutput;
 }
 
+// This function transforms the output. The data layout of the output is HWNF.
+// The transformation matrix is 2-dimension. We need to extract H x W from
+// HWNF first. We need to generate 2 levels of loops to iterate on N and F.
+// After the transformation, we get
+//
+// scf.for %n = lo_n to hi_n step 1
+//   scf.for %f = lo_f to hi_f step 1
+// %extracted = extract input from result
+// %ret = linalg.matmul AT, %extracted
+// %ret = linalg.matmul %ret, A
+// %inserted = insert %ret into ret
+//
+Value outputTransform(RewriterBase &rewriter, Location loc, Value value,
+  Value output, int64_t m, int64_t r,
+  bool leftTransform = true, bool rightTransform = true) {
+  // Map from (m, r) to AT transform matrix.
+  static const llvm::SmallDenseMap
+  ATMatrices = {
+  {F_2_3, TransformMatrix(AT_2x2_3x3, 2, 4)},
+  {F_4_3, TransformMatrix(AT_4x4_3x3, 4, 6, 32)},
+  {F_2_5, TransformMatrix(AT_2x2_5x5, 2, 6, 16)},
+  };
+
+  // Map from (m, r) to A transform matrix.
+  static const llvm::SmallDenseMap
+  AMatrices = {
+  {F_2_3, TransformMatrix(A_2x2_3x3, 4, 2)},
+  {F_4_3, TransformMatrix(A_4x4_3x3, 6, 4, 32)},
+  {F_2_5, TransformMatrix(A_2x2_5x5, 6, 2, 16)},
+  };
+
+  auto valueType = cast(value.getType());
+  Type elementType = valueType.getElementType();
+  auto valueShape = valueType.getShape(); // TileH, TileW, H, W, N, F
+  int64_t valueH = valueShape[2];
+  int64_t valueW = valueShape[3];
+  int64_t valueN = valueShape[4];
+  int64_t valueF = valueShape[5];
+  int64_t alphaH = leftTransform ? m + r - 1 : 1;
+  int64_t alphaW = rightTransform ? m + r - 1 : 1;
+
+  if (valueH != alphaH && valueH != 1)
+return Value();
+  if (valueW != alphaW && valueW != 1)
+return Value();
+
+  auto zeroIdx = rewriter.create(loc, 0);
+  auto nUpperBound = rewriter.create(loc, valueN);
+  auto fUpperBound = rewriter.create(loc, valueF);
+  auto oneStep = rewriter.create(loc, 1);
+
+  auto outerForOp =
+  rewriter.create(loc, zeroIdx, nUpperBound, oneStep, output);
+  Block *outerForBody = outerForOp.getBody();
+  rewriter.setInsertionPointToStart(outerForBody);
+  Value NIter = outerForBody->getArgument(0);
+
+  auto innerForOp = rewriter.create(
+  loc, zeroIdx, fUpperBound, oneStep, outerForOp.getRegionIterArgs()[0]);
+  Block *innerForBody = innerForOp.getBody();
+  rewriter.setInsertionPointToStart(innerForBody);
+  Value FIter = innerForBody->getArgument(0);

Hsiangkai wrote:

Done.

https://github.com/llvm/llvm-project/pull/96183
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][linalg] Decompose winograd operators (PR #96183)

2024-06-27 Thread Hsiangkai Wang via llvm-branch-commits


@@ -289,6 +938,123 @@ FailureOr winogradConv2DHelper(RewriterBase 
&rewriter,
   return transformedOutput.getDefiningOp();
 }
 
+FailureOr
+decomposeWinogradFilterTransformHelper(RewriterBase &rewriter,
+   linalg::WinogradFilterTransformOp op) {
+  Location loc = op.getLoc();
+  Value filter = op.getFilter();
+  auto filterType = cast(filter.getType());
+  auto filterShape = filterType.getShape();
+  int64_t filterH = filterShape[1];
+  int64_t filterW = filterShape[2];
+
+  // For F(m x 1, r x 1), we only need to do left side transform.
+  bool leftTransform = filterH != 1;
+  // For F(1 x m, 1 x r), we only need to do right side transform.
+  bool rightTransform = filterW != 1;
+  Value transformedFilter =
+  filterTransform(rewriter, loc, filter, op.getOutput(), op.getM(),
+  op.getR(), leftTransform, rightTransform);
+  if (!transformedFilter)
+return failure();
+
+  rewriter.replaceOp(op, transformedFilter);
+
+  return transformedFilter.getDefiningOp();
+}
+
+FailureOr
+decomposeWinogradInputTransformHelper(RewriterBase &rewriter,
+  linalg::WinogradInputTransformOp op) {
+  Location loc = op.getLoc();
+  Value input = op.getInput();
+  auto inputType = cast(input.getType());
+  auto inputShape = inputType.getShape();
+  int64_t inputH = inputShape[1];
+  int64_t inputW = inputShape[2];
+
+  // For F(m x 1, r x 1), we only need to do left side transform.
+  bool leftTransform = inputH != 1;
+  // For F(1 x m, 1 x r), we only need to do right side transform.
+  bool rightTransform = inputW != 1;
+  Value transformedInput =
+  inputTransform(rewriter, loc, op.getInput(), op.getOutput(), op.getM(),
+ op.getR(), leftTransform, rightTransform);
+  if (!transformedInput)
+return failure();
+
+  rewriter.replaceOp(op, transformedInput);
+
+  return transformedInput.getDefiningOp();
+}
+
+FailureOr
+decomposeWinogradOutputTransformHelper(RewriterBase &rewriter,
+   linalg::WinogradOutputTransformOp op) {
+  Location loc = op.getLoc();
+  Value value = op.getValue();
+  auto valueType = cast(value.getType());
+  auto valueShape = valueType.getShape();
+  int64_t valueH = valueShape[2];
+  int64_t valueW = valueShape[3];
+
+  // For F(m x 1, r x 1), we only need to do left side transform.
+  bool leftTransform = valueH != 1;
+  // For F(1 x m, 1 x r), we only need to do right side transform.
+  bool rightTransform = valueW != 1;
+  Value transformedOutput =
+  outputTransform(rewriter, loc, value, op.getOutput(), op.getM(),
+  op.getR(), leftTransform, rightTransform);
+  if (!transformedOutput)
+return failure();
+
+  rewriter.replaceOp(op, transformedOutput);
+
+  return transformedOutput.getDefiningOp();
+}
+
+class DecomposeWinogradFilterTransform final
+: public OpRewritePattern {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(linalg::WinogradFilterTransformOp op,
+PatternRewriter &rewriter) const override {
+if (failed(decomposeWinogradFilterTransformHelper(rewriter, op)))
+  return failure();
+
+return success();
+  }
+};
+
+class DecomposeWinogradInputTransform final
+: public OpRewritePattern {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(linalg::WinogradInputTransformOp op,
+PatternRewriter &rewriter) const override {
+if (failed(decomposeWinogradInputTransformHelper(rewriter, op)))
+  return failure();
+
+return success();

Hsiangkai wrote:

Done.

https://github.com/llvm/llvm-project/pull/96183
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][linalg] Decompose winograd operators (PR #96183)

2024-06-27 Thread Hsiangkai Wang via llvm-branch-commits


@@ -323,5 +1089,12 @@ void populateWinogradConv2DPatterns(RewritePatternSet 
&patterns, int64_t m,
   patterns.insert(context, m, r);
 }
 
+void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns) {
+  MLIRContext *context = patterns.getContext();
+  patterns.insert(context);
+  patterns.insert(context);
+  patterns.insert(context);

Hsiangkai wrote:

Done.

https://github.com/llvm/llvm-project/pull/96183
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Remove ds_fmin/ds_fmax intrinsics (PR #96739)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96739

>From 864e3bbfc5f40bfb1e87f7689ede0d5f33aa42da Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 11 Jun 2024 11:46:15 +0200
Subject: [PATCH] AMDGPU: Remove ds_fmin/ds_fmax intrinsics

These have been replaced with atomicrmw.
---
 llvm/docs/ReleaseNotes.rst|5 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   14 -
 llvm/lib/IR/AutoUpgrade.cpp   |8 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   32 -
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |3 -
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |2 -
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp  |   20 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   15 +-
 llvm/test/Bitcode/amdgcn-atomic.ll|   52 +
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll  |  371 -
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll  |  279 
 .../CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll| 1418 -
 12 files changed, 65 insertions(+), 2154 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 416b3952f1ac4..ed7d252668850 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -132,6 +132,11 @@ Changes to the AMDGPU Backend
 
 * Implemented :ref:`llvm.get.rounding <int_get_rounding>` and 
:ref:`llvm.set.rounding <int_set_rounding>`
 
+* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and
+  ``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the
+  :ref:`atomicrmw <i_atomicrmw>` instruction with `fadd`, `fmin` and
+  `fmax` with addrspace(3) instead.
+
 Changes to the ARM Backend
 --
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d040aa8f38278..71b1e832bde3c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -523,17 +523,6 @@ def int_amdgcn_fmad_ftz :
 [IntrNoMem, IntrSpeculatable]
 >;
 
-class AMDGPULDSIntrin :
-  Intrinsic<[llvm_any_ty],
-[LLVMQualPointerType<3>,
-LLVMMatchType<0>,
-llvm_i32_ty, // ordering
-llvm_i32_ty, // scope
-llvm_i1_ty], // isVolatile
-[IntrArgMemOnly, IntrWillReturn, NoCapture>,
- ImmArg>, ImmArg>, ImmArg>, 
IntrNoCallback, IntrNoFree]
->;
-
 // FIXME: The m0 argument should be moved after the normal arguments
 class AMDGPUDSOrderedIntrinsic : Intrinsic<
   [llvm_i32_ty],
@@ -571,9 +560,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
 
-def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
-def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
-
 } // TargetPrefix = "amdgcn"
 
 // New-style image intrinsics
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index d7825d9b3e3e5..32076a07d30e7 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1033,8 +1033,10 @@ static bool upgradeIntrinsicFunction1(Function *F, 
Function *&NewFn,
 break; // No other 'amdgcn.atomic.*'
   }
 
-  if (Name.starts_with("ds.fadd")) {
-// Replaced with atomicrmw fadd, so there's no new declaration.
+  if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
+  Name.starts_with("ds.fmax")) {
+// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
+// declaration.
 NewFn = nullptr;
 return true;
   }
@@ -2347,6 +2349,8 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, 
CallBase *CI,
   AtomicRMWInst::BinOp RMWOp =
   StringSwitch(Name)
   .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
+  .StartsWith("ds.fmin", AtomicRMWInst::FMin)
+  .StartsWith("ds.fmax", AtomicRMWInst::FMax)
   .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
   .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index f1254b2e9e1d2..dc165d65fa6ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5401,35 +5401,6 @@ bool 
AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
   return true;
 }
 
-static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
-  switch (IID) {
-  case Intrinsic::amdgcn_ds_fmin:
-return AMDGPU::G_ATOMICRMW_FMIN;
-  case Intrinsic::amdgcn_ds_fmax:
-return AMDGPU::G_ATOMICRMW_FMAX;
-  default:
-llvm_unreachable("not a DS FP intrinsic");
-  }
-}
-
-bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
-  MachineInstr &MI,
- 

[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96443

>From eaa00157741d5e4f134df22ed27a80fe3d853e6e Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH 1/3] AMDGPU: Add subtarget feature for global atomic fadd
 denormal support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 14 --
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 56ec5e9c4cfc2..6b212e1b2af03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1631,7 +1639,9 @@ def FeatureISAVersion12 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
-   Feature1_5xVGPRs]>;
+   Feature1_5xVGPRs,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   ]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 9e2a316a9ed28..db0b2b67a0388 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.

>From 84c8e017f521236c51a75a275c24f87dc919fd4b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 12:10:37 +0200
Subject: [PATCH 2/3] Add to gfx11.

RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm"
though I'm not sure I trust it.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 6b212e1b2af03..39a1d629a4aea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen
@@ -1640,7 +1641,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
Feature1_5xVGPRs,
-   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<

>From 5a627920d5c77a3b1d9b9ec1ddef1aa31fa1cf09 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 11:30:51 +0200
Subject: [PATCH 3/3] Rename

---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 39a1d629a4aea..34c6f6ff19bff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -78

[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From 0381e27b091f0cb6558fb9b4bf3e5359655acab0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 21 ++---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td |  6 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 10 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 34c6f6ff19bff..84ea040477763 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFAddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasMemoryAtomicFaddF32DenormalSupport",
@@ -1390,7 +1397,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
@@ -1435,7 +1443,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFAddF32DenormalSupport
+   FeatureMemoryAtomicFAddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1932,11 +1941,9 @@ def isGFX12Plus :
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
-
-def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
-  Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
-  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasFlatBufferGlobalAtomicFaddF64Inst :
+  Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
+  AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
 
 def HasAtomicFMinFMaxF32GlobalInsts :
   Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 3b8d94b744000..a904c8483dbf5 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in {
   }
 } // End SubtargetPredicate = isGFX90APlus
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+}
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
   let SubtargetPredicate = isGFX940Plus;
@@ -1836,9 +1838,9 @@ let SubtargetPredicate = 
HasAtomicBufferGlobalPkAddF16Insts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
 } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, 
"BUFFER_ATOMIC_ADD_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..89946a4719557 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96872

Need to emit syncscope and new metadata to get the native instruction,
most of the time.

>From bd298a4cb7aaa7f287da0654c8a530e378f0362a Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 11 Jun 2024 10:58:44 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for
 __builtin_amdgcn_global_atomic_fadd_{f32|f64}

Need to emit syncscope and new metadata to get the native instruction,
most of the time.
---
 clang/lib/CodeGen/CGBuiltin.cpp   | 39 +--
 .../CodeGenOpenCL/builtins-amdgcn-gfx11.cl|  2 +-
 .../builtins-fp-atomics-gfx12.cl  |  4 +-
 .../builtins-fp-atomics-gfx90a.cl |  4 +-
 .../builtins-fp-atomics-gfx940.cl |  4 +-
 5 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 96dcf6283f9f8..d90762748d925 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18654,8 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18667,18 +18666,11 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   ArgTy = llvm::FixedVectorType::get(
   llvm::Type::getHalfTy(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -19091,7 +19083,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19107,6 +19101,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
@@ -19133,8 +19129,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-  SSID = llvm::SyncScope::System;
+  // Most of the builtins do not have syncscope/order arguments. For DS
+  // atomics the scope doesn't really matter, as they implicitly operate at
+  // workgroup scope.
+  //
+  // The global/flat cases need to use agent scope to consistently produce
+  // the native instruction instead of a cmpxchg expansion.
+  SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19149,6 +19150,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
 if (Volatile)
   RMW->setVolatile(true);
+
+unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+if (AddrSpace != llvm::AMDGPUAS::L

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins (PR #96873)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96873

None

>From 65a690d80cf39df132cacff510371c9dcb1b97fd Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:12:59 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from
 {global|flat}_atomic_fadd_v2f16 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 20 ++-
 .../builtins-fp-atomics-gfx12.cl  |  9 ++---
 .../builtins-fp-atomics-gfx90a.cl |  2 +-
 .../builtins-fp-atomics-gfx940.cl |  3 ++-
 4 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d90762748d925..2a1861e4413fd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18655,22 +18655,15 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -18690,11 +18683,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ArgTy = llvm::Type::getFloatTy(getLLVMContext());
   IID = Intrinsic::amdgcn_flat_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19085,7 +19073,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19103,6 +19093,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 21c1c38bc78dc..cf304d7b0818a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -48,7 +48,8 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr 
%{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX12-LABEL:  test_flat_add_2f16
 // GFX12: flat_atomic_pk_add_f16
 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
@@ -64,7 +65,8 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_global_add_half2
-// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr 
addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> 
%{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX12-LABEL:  test_global_add_half2
 // GFX12:  global_atomic_pk_add_f16 v2, v[0:1], v2, off th

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64} builtins (PR #96874)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96874

None

>From 9347154207e5a8d75755b11813b870b207fd125a Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:15:26 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64}
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp | 17 ++---
 .../CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl |  6 --
 .../CodeGenOpenCL/builtins-fp-atomics-gfx940.cl |  3 ++-
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2a1861e4413fd..54e363d6fd0e8 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18657,10 +18657,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   }
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
@@ -18670,19 +18668,12 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmin;
   break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19075,7 +19066,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19095,6 +19088,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 4980c44215743..60a3033a36c17 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -45,7 +45,8 @@ void test_global_max_f64(__global double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_add_local_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr 
addrspace(3) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8{{$}}
+
 // GFX90A-LABEL:  test_flat_add_local_f64$local
 // GFX90A:  ds_add_rtn_f64
 void test_flat_add_local_f64(__local double *addr, double x){
@@ -54,7 +55,8 @@ void test_flat_add_local_f64(__local double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_global_add_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_flat_global_add_f64$local
 // GFX90A:  global_atomic_add_f64
 void test_flat_global_add_f64(__global double *addr, double x){
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index a4f438bea33a6..2618e2809fbbf 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -10,7 +10,8 @@ typedef half  __attribute__((ext_vector_type(2)))

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96872?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#96873** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96873?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96872** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96872?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈
* **#96760** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96760?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96759** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96759?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96444** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96444?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96443** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96443?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96442** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96442?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95930** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95930?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95929** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95929?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`

This stack of pull requests is managed by Graphite. <a href="https://stacking.dev/?utm_source=stack-comment">Learn more about stacking</a>.


 Join @arsenm and the rest of your teammates on <a href="https://graphite.dev?utm-source=stack-comment"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="11px" height="11px"/> Graphite</a>
  

https://github.com/llvm/llvm-project/pull/96872
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (PR #96875)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96875

None

>From 94d04eb6576b811e11175ca36a340649a63bf007 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:34:43 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 26 ++-
 .../builtins-fp-atomics-gfx12.cl  | 18 ++---
 .../builtins-fp-atomics-gfx940.cl | 10 +--
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 54e363d6fd0e8..4bbb4375ee997 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18681,22 +18681,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
 return Builder.CreateCall(F, {Addr, Val});
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
-Intrinsic::ID IID;
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19068,7 +19052,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19090,6 +19076,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
@@ -19126,7 +19114,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
-  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
+  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
 llvm::Type *V2BF16Ty = FixedVectorType::get(
 llvm::Type::getBFloatTy(Builder.getContext()), 2);
 Val = Builder.CreateBitCast(Val, V2BF16Ty);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index cf304d7b0818a..8d6bb948b0a7a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
 
 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> 
[[BC0]] syncscope("agent") seq_cst, align 4
+// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x 
bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
 
 // GFX12-LABEL:  test_local_add_2bf16
@@ -57,7 +57,10 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2bf16
-// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, 
<2 x i16> %{{.*}})
+// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC]] 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
+
 // GFX12-LABEL:  test_flat_add_2bf16
 // GFX12: flat_atomic_pk_

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64} builtins (PR #96874)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack 
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96874?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#96874** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96874?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈
* **#96873** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96873?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96872** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96872?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96760** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96760?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96759** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96759?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96444** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96444?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96443** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96443?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96442** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96442?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95930** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95930?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95929** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95929?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`

This stack of pull requests is managed by Graphite. <a href="https://stacking.dev/?utm_source=stack-comment">Learn more about stacking.</a>


Join @arsenm and the rest of your teammates on <a href="https://graphite.dev?utm-source=stack-comment"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="11px" height="11px"/> Graphite</a>
  

https://github.com/llvm/llvm-project/pull/96874
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins (PR #96873)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack 
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96873?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#96874** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96874?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96873** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96873?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈
* **#96872** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96872?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96760** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96760?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96759** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96759?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96444** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96444?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96443** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96443?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96442** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96442?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95930** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95930?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95929** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95929?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`

This stack of pull requests is managed by Graphite. <a href="https://stacking.dev/?utm_source=stack-comment">Learn more about stacking.</a>


Join @arsenm and the rest of your teammates on <a href="https://graphite.dev?utm-source=stack-comment"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="11px" height="11px"/> Graphite</a>
  

https://github.com/llvm/llvm-project/pull/96873
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96876

None

>From 0e174a54c24c70343a0e28c6ca053ab4bbbae3d2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 23:18:32 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max
 f64 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 42 +++
 .../builtins-fp-atomics-gfx90a.cl | 18 +---
 2 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4bbb4375ee997..a3115bfa4d230 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18655,32 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
-Intrinsic::ID IID;
-llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmax;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmax;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F =
-CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19054,7 +19028,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19080,6 +19058,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
+case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  BinOp = llvm::AtomicRMWInst::FMin;
+  break;
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  BinOp = llvm::AtomicRMWInst::FMax;
+  break;
 }
 
 Address Ptr = CheckAtomicAlignment(*this, E);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 60a3033a36c17..cfc5adc57bf5e 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -27,7 +27,8 @@ void test_global_add_half2(__global half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_global_global_min_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmin ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_global_min_f64$local
 // GFX90A:  global_atomic_min_f64
 void test_global_global_min_f64(__global double *addr, double x){
@@ -36,7 +37,8 @@ void test_global_global_min_f64(__global double *addr, double 
x){
 }
 
 // CHECK-LABEL: test_global_max_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmax ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_max_f64$local
 // GFX90A:  global_atomic_max_f64
 void tes

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (PR #96875)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack 
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96875?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#96876** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96876?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96875** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96875?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈
* **#96874** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96874?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96873** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96873?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96872** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96872?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96760** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96760?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96759** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96759?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96444** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96444?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96443** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96443?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96442** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96442?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95930** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95930?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95929** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95929?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`

This stack of pull requests is managed by Graphite. <a href="https://stacking.dev/?utm_source=stack-comment">Learn more about stacking.</a>


Join @arsenm and the rest of your teammates on <a href="https://graphite.dev?utm-source=stack-comment"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="11px" height="11px"/> Graphite</a>
  

https://github.com/llvm/llvm-project/pull/96875
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack 
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96876?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#96876** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96876?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈
* **#96875** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96875?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96874** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96874?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96873** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96873?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96872** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96872?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96760** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96760?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96759** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96759?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96444** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96444?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96443** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96443?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#96442** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/96442?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95930** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95930?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#95929** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/95929?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`

This stack of pull requests is managed by Graphite. <a href="https://stacking.dev/?utm_source=stack-comment">Learn more about stacking.</a>


Join @arsenm and the rest of your teammates on <a href="https://graphite.dev?utm-source=stack-comment"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="11px" height="11px"/> Graphite</a>
  

https://github.com/llvm/llvm-project/pull/96876
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96872
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins (PR #96873)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96873
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64} builtins (PR #96874)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96874
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins (PR #96873)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-globalisel

Author: Matt Arsenault (arsenm)


Changes



---
Full diff: https://github.com/llvm/llvm-project/pull/96873.diff


4 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+6-14) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl (+6-3) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl (+1-1) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl (+2-1) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d90762748d925..2a1861e4413fd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18655,22 +18655,15 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -18690,11 +18683,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ArgTy = llvm::Type::getFloatTy(getLLVMContext());
   IID = Intrinsic::amdgcn_flat_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19085,7 +19073,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19103,6 +19093,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 21c1c38bc78dc..cf304d7b0818a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -48,7 +48,8 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr 
%{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX12-LABEL:  test_flat_add_2f16
 // GFX12: flat_atomic_pk_add_f16
 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
@@ -64,7 +65,8 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_global_add_half2
-// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr 
addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> 
%{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX12-LABEL:  test_global_add_half2
 // GFX12:  global_atomic_pk_add_f16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
 void test_global_add_half2(__global half2 *addr, half2 x) {
@@ -73,7 +75,8 @@ void test_global_add_half2(__glo

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (PR #96875)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96875
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64} builtins (PR #96874)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---
Full diff: https://github.com/llvm/llvm-project/pull/96874.diff


3 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+6-11) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl (+4-2) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl (+2-1) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2a1861e4413fd..54e363d6fd0e8 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18657,10 +18657,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   }
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
@@ -18670,19 +18668,12 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmin;
   break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19075,7 +19066,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19095,6 +19088,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 4980c44215743..60a3033a36c17 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -45,7 +45,8 @@ void test_global_max_f64(__global double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_add_local_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr 
addrspace(3) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8{{$}}
+
 // GFX90A-LABEL:  test_flat_add_local_f64$local
 // GFX90A:  ds_add_rtn_f64
 void test_flat_add_local_f64(__local double *addr, double x){
@@ -54,7 +55,8 @@ void test_flat_add_local_f64(__local double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_global_add_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_flat_global_add_f64$local
 // GFX90A:  global_atomic_add_f64
 void test_flat_global_add_f64(__global double *addr, double x){
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index a4f438bea33a6..2618e2809fbbf 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -10,7 +10,8 @@ typedef half  __attribute__((ext_vector_type(2))) half2;
 typedef short __attribute__((ext_vector_type(2))) short2;
 
 // CHECK-LABEL: test_flat_add_f32

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96876
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (PR #96875)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---
Full diff: https://github.com/llvm/llvm-project/pull/96875.diff


3 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+8-18) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl (+14-4) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl (+8-2) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 54e363d6fd0e8..4bbb4375ee997 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18681,22 +18681,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
 return Builder.CreateCall(F, {Addr, Val});
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
-Intrinsic::ID IID;
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19068,7 +19052,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19090,6 +19076,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
@@ -19126,7 +19114,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
-  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
+  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
 llvm::Type *V2BF16Ty = FixedVectorType::get(
 llvm::Type::getBFloatTy(Builder.getContext()), 2);
 Val = Builder.CreateBitCast(Val, V2BF16Ty);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index cf304d7b0818a..8d6bb948b0a7a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
 
 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> 
[[BC0]] syncscope("agent") seq_cst, align 4
+// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x 
bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
 
 // GFX12-LABEL:  test_local_add_2bf16
@@ -57,7 +57,10 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2bf16
-// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, 
<2 x i16> %{{.*}})
+// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC]] 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
+
 // GFX12-LABEL:  test_flat_add_2bf16
 // GFX12: flat_atomic_pk_add_bf16
 short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
@@ -84,7 +87,11 @@ void test_global_add

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---
Full diff: https://github.com/llvm/llvm-project/pull/96876.diff


2 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+15-27) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl (+12-6) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4bbb4375ee997..a3115bfa4d230 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18655,32 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
-Intrinsic::ID IID;
-llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmax;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmax;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F =
-CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19054,7 +19028,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19080,6 +19058,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
+case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  BinOp = llvm::AtomicRMWInst::FMin;
+  break;
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  BinOp = llvm::AtomicRMWInst::FMax;
+  break;
 }
 
 Address Ptr = CheckAtomicAlignment(*this, E);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 60a3033a36c17..cfc5adc57bf5e 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -27,7 +27,8 @@ void test_global_add_half2(__global half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_global_global_min_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmin ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_global_min_f64$local
 // GFX90A:  global_atomic_min_f64
 void test_global_global_min_f64(__global double *addr, double x){
@@ -36,7 +37,8 @@ void test_global_global_min_f64(__global double *addr, double 
x){
 }
 
 // CHECK-LABEL: test_global_max_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmax ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_max_f64$local
 // GFX90A:  global_atomic_max_f64
 void test_global_max_f64(__global double *addr, double x){
@@ -65,7 +67,8 @@ void test_flat_global_add_f64(__global double *addr, double 
x){
 }
 
 // CHECK-LABEL: test_flat

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-globalisel

Author: Matt Arsenault (arsenm)


Changes



---
Full diff: https://github.com/llvm/llvm-project/pull/96876.diff


2 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+15-27) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl (+12-6) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4bbb4375ee997..a3115bfa4d230 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18655,32 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
-Intrinsic::ID IID;
-llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmax;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmax;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F =
-CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19054,7 +19028,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19080,6 +19058,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
+case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  BinOp = llvm::AtomicRMWInst::FMin;
+  break;
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  BinOp = llvm::AtomicRMWInst::FMax;
+  break;
 }
 
 Address Ptr = CheckAtomicAlignment(*this, E);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 60a3033a36c17..cfc5adc57bf5e 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -27,7 +27,8 @@ void test_global_add_half2(__global half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_global_global_min_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmin ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_global_min_f64$local
 // GFX90A:  global_atomic_min_f64
 void test_global_global_min_f64(__global double *addr, double x){
@@ -36,7 +37,8 @@ void test_global_global_min_f64(__global double *addr, double 
x){
 }
 
 // CHECK-LABEL: test_global_max_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmax ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_max_f64$local
 // GFX90A:  global_atomic_max_f64
 void test_global_max_f64(__global double *addr, double x){
@@ -65,7 +67,8 @@ void test_flat_global_add_f64(__global double *addr, double 
x){
 }
 
 // CHECK-LABEL: test_fla

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes

We need to emit the syncscope and the new metadata to get the native
instruction most of the time.

---
Full diff: https://github.com/llvm/llvm-project/pull/96872.diff


5 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+27-12) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl (+1-1) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl (+2-2) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl (+2-2) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl (+2-2) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 96dcf6283f9f8..d90762748d925 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18654,8 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18667,18 +18666,11 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   ArgTy = llvm::FixedVectorType::get(
   llvm::Type::getHalfTy(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -19091,7 +19083,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19107,6 +19101,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
@@ -19133,8 +19129,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-  SSID = llvm::SyncScope::System;
+  // Most of the builtins do not have syncscope/order arguments. For DS
+  // atomics the scope doesn't really matter, as they implicitly operate at
+  // workgroup scope.
+  //
+  // The global/flat cases need to use agent scope to consistently produce
+  // the native instruction instead of a cmpxchg expansion.
+  SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19149,6 +19150,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
 if (Volatile)
   RMW->setVolatile(true);
+
+unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+  // Most targets require "amdgpu.no.fine.grained.memory" to emit the 
native
+  // instruction for flat and global operations.
+  llvm::MDTuple *EmptyMD = MDNode::g

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Matt Arsenault (arsenm)


Changes

We need to emit the syncscope and the new metadata to get the native
instruction most of the time.

---
Full diff: https://github.com/llvm/llvm-project/pull/96872.diff


5 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+27-12) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl (+1-1) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl (+2-2) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl (+2-2) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl (+2-2) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 96dcf6283f9f8..d90762748d925 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18654,8 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18667,18 +18666,11 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   ArgTy = llvm::FixedVectorType::get(
   llvm::Type::getHalfTy(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -19091,7 +19083,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19107,6 +19101,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
@@ -19133,8 +19129,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-  SSID = llvm::SyncScope::System;
+  // Most of the builtins do not have syncscope/order arguments. For DS
+  // atomics the scope doesn't really matter, as they implicitly operate at
+  // workgroup scope.
+  //
+  // The global/flat cases need to use agent scope to consistently produce
+  // the native instruction instead of a cmpxchg expansion.
+  SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19149,6 +19150,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
 if (Volatile)
   RMW->setVolatile(true);
+
+unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+  // Most targets require "amdgpu.no.fine.grained.memory" to emit the 
native
+  // instruction for flat and global operations.
+  llvm::MDTuple *EmptyMD = MDNode::

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-globalisel

Author: Matt Arsenault (arsenm)


Changes

We need to emit the syncscope and the new metadata to get the native
instruction most of the time.

---
Full diff: https://github.com/llvm/llvm-project/pull/96872.diff


5 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+27-12) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl (+1-1) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl (+2-2) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl (+2-2) 
- (modified) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl (+2-2) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 96dcf6283f9f8..d90762748d925 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18654,8 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18667,18 +18666,11 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   ArgTy = llvm::FixedVectorType::get(
   llvm::Type::getHalfTy(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -19091,7 +19083,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19107,6 +19101,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
@@ -19133,8 +19129,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-  SSID = llvm::SyncScope::System;
+  // Most of the builtins do not have syncscope/order arguments. For DS
+  // atomics the scope doesn't really matter, as they implicitly operate at
+  // workgroup scope.
+  //
+  // The global/flat cases need to use agent scope to consistently produce
+  // the native instruction instead of a cmpxchg expansion.
+  SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19149,6 +19150,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
 if (Volatile)
   RMW->setVolatile(true);
+
+unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+  // Most targets require "amdgpu.no.fine.grained.memory" to emit the 
native
+  // instruction for flat and global operations.
+  llvm::MDTuple *EmptyMD = MDNode::

[llvm-branch-commits] [clang] [Clang] Extend lifetime bound analysis to support assignments (PR #96475)

2024-06-27 Thread Haojian Wu via llvm-branch-commits

https://github.com/hokein updated 
https://github.com/llvm/llvm-project/pull/96475

error: too big or took too long to generate
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [TySan] Fixed false positive when accessing offset member variables (PR #95387)

2024-06-27 Thread via llvm-branch-commits

https://github.com/gbMattN ready_for_review 
https://github.com/llvm/llvm-project/pull/95387
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread Yaxun Liu via llvm-branch-commits


@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
 }
 
 // CHECK-LABEL: @test_global_add_f32
-// CHECK: {{.*}}call{{.*}} float 
@llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float 
%{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") 
seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, 
!amdgpu.ignore.denormal.mode !{{[0-9]+$}}

yxsamliu wrote:

Why is the memory order seq_cst? Does this generate the same ISA as before? 
Can we add a test that emits assembly directly from clang to make sure the ISA 
does not change? 

https://github.com/llvm/llvm-project/pull/96872
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits


@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
 }
 
 // CHECK-LABEL: @test_global_add_f32
-// CHECK: {{.*}}call{{.*}} float 
@llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float 
%{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") 
seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, 
!amdgpu.ignore.denormal.mode !{{[0-9]+$}}

arsenm wrote:

That's the most conservative option. The current intrinsic handling isn't 
treated as an atomic at all, and the lowering adds a volatile flag instead. 
With seq_cst you end up with an additional cache flush compared to the current 
intrinsic. Release seems to be the strongest ordering that doesn't introduce a 
new flush afterwards.

Running codegen and checking ISA is generally discouraged in clang tests.

https://github.com/llvm/llvm-project/pull/96872
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [Flang][OpenMP] Update flang with changes to the OpenMP dialect (PR #92524)

2024-06-27 Thread Kareem Ergawy via llvm-branch-commits

https://github.com/ergawy approved this pull request.


https://github.com/llvm/llvm-project/pull/92524
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Remove ds_fmin/ds_fmax intrinsics (PR #96739)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Jun 27, 9:27 AM EDT**: @arsenm started a stack merge that includes this 
pull request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/96739).


https://github.com/llvm/llvm-project/pull/96739
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96443

>From 5336548933c1ebd9a9e69938085a42d4ecac1511 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH 1/3] AMDGPU: Add subtarget feature for global atomic fadd
 denormal support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 14 --
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 56ec5e9c4cfc2..6b212e1b2af03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1631,7 +1639,9 @@ def FeatureISAVersion12 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
-   Feature1_5xVGPRs]>;
+   Feature1_5xVGPRs,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   ]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 9e2a316a9ed28..db0b2b67a0388 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.

>From 5b5b5666f9293930a5dd3a8e65d9838d32c8d68f Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 12:10:37 +0200
Subject: [PATCH 2/3] Add to gfx11.

RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm"
though I'm not sure I trust it.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 6b212e1b2af03..39a1d629a4aea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen
@@ -1640,7 +1641,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
Feature1_5xVGPRs,
-   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<

>From 1e3c134b245dedc2996cbf45a8cf49d109cd0772 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 11:30:51 +0200
Subject: [PATCH 3/3] Rename

---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 39a1d629a4aea..34c6f6ff19bff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -78

[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From 234b772ad9a5f5a430da538474edcc968233f2ad Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 21 ++---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td |  6 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 10 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 34c6f6ff19bff..84ea040477763 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFAddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasMemoryAtomicFaddF32DenormalSupport",
@@ -1390,7 +1397,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
@@ -1435,7 +1443,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFAddF32DenormalSupport
+   FeatureMemoryAtomicFAddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1932,11 +1941,9 @@ def isGFX12Plus :
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
-
-def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
-  Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
-  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasFlatBufferGlobalAtomicFaddF64Inst :
+  Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
+  AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
 
 def HasAtomicFMinFMaxF32GlobalInsts :
   Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 3b8d94b744000..a904c8483dbf5 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in {
   }
 } // End SubtargetPredicate = isGFX90APlus
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+}
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
   let SubtargetPredicate = isGFX940Plus;
@@ -1836,9 +1838,9 @@ let SubtargetPredicate = 
HasAtomicBufferGlobalPkAddF16Insts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
 } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, 
"BUFFER_ATOMIC_ADD_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..89946a4719557 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96872

>From 0f902e34937041d5171ad9e642724ec4594f601d Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 11 Jun 2024 10:58:44 +0200
Subject: [PATCH 1/2] clang/AMDGPU: Emit atomicrmw for
 __builtin_amdgcn_global_atomic_fadd_{f32|f64}

Need to emit syncscope and new metadata to get the native instruction,
most of the time.
---
 clang/lib/CodeGen/CGBuiltin.cpp   | 39 +--
 .../CodeGenOpenCL/builtins-amdgcn-gfx11.cl|  2 +-
 .../builtins-fp-atomics-gfx12.cl  |  4 +-
 .../builtins-fp-atomics-gfx90a.cl |  4 +-
 .../builtins-fp-atomics-gfx940.cl |  4 +-
 5 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 98c2f70664ec7..382812aeecf7f 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18632,8 +18633,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18645,18 +18644,11 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   ArgTy = llvm::FixedVectorType::get(
   llvm::Type::getHalfTy(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -19071,7 +19063,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19087,6 +19081,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19121,8 +19117,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-  SSID = llvm::SyncScope::System;
+  // Most of the builtins do not have syncscope/order arguments. For DS
+  // atomics the scope doesn't really matter, as they implicitly operate at
+  // workgroup scope.
+  //
+  // The global/flat cases need to use agent scope to consistently produce
+  // the native instruction instead of a cmpxchg expansion.
+  SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19137,6 +19138,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
 if (Volatile)
   RMW->setVolatile(true);
+
+unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+  // Most targets require "amdgpu.no.fine.grained.memory" to emit the 
nativ

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins (PR #96873)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96873

>From f9fe227fedd64aab31d62e04f63cc1b709ce4d7f Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:12:59 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from
 {global|flat}_atomic_fadd_v2f16 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 20 ++-
 .../builtins-fp-atomics-gfx12.cl  |  9 ++---
 .../builtins-fp-atomics-gfx90a.cl |  2 +-
 .../builtins-fp-atomics-gfx940.cl |  3 ++-
 4 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3015a5de4e690..4729f81a48707 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18633,22 +18633,15 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -18668,11 +18661,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ArgTy = llvm::Type::getFloatTy(getLLVMContext());
   IID = Intrinsic::amdgcn_flat_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19065,7 +19053,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19083,6 +19073,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 6b8a6d14575db..07e63a8711c7f 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -48,7 +48,8 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr 
%{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX12-LABEL:  test_flat_add_2f16
 // GFX12: flat_atomic_pk_add_f16
 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
@@ -64,7 +65,8 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_global_add_half2
-// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr 
addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> 
%{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX12-LABEL:  test_global_add_half2
 // GFX12:  global_atomic_pk_add_f16 v2, v[0:1], v2, off

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64} builtins (PR #96874)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96874

>From 5df69e0bb808e808c0638a95ed4d14f135b88a09 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:15:26 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64}
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp | 17 ++---
 .../CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl |  6 --
 .../CodeGenOpenCL/builtins-fp-atomics-gfx940.cl |  3 ++-
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4729f81a48707..6354051c77ebd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18635,10 +18635,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   }
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
@@ -18648,19 +18646,12 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmin;
   break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19055,7 +19046,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19075,6 +19068,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index cd10777dbe079..02e289427238f 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -45,7 +45,8 @@ void test_global_max_f64(__global double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_add_local_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr 
addrspace(3) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8{{$}}
+
 // GFX90A-LABEL:  test_flat_add_local_f64$local
 // GFX90A:  ds_add_rtn_f64
 void test_flat_add_local_f64(__local double *addr, double x){
@@ -54,7 +55,8 @@ void test_flat_add_local_f64(__local double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_global_add_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_flat_global_add_f64$local
 // GFX90A:  global_atomic_add_f64
 void test_flat_global_add_f64(__global double *addr, double x){
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index 589dcd406630d..bd9b8c7268e06 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -10,7 +10,8 @@ typedef half  _

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (PR #96875)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96875

>From 39ecce1f9c4b668761b78fe3c901b9200fed43f7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:34:43 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 26 ++-
 .../builtins-fp-atomics-gfx12.cl  | 24 -
 .../builtins-fp-atomics-gfx90a.cl |  6 ++---
 .../builtins-fp-atomics-gfx940.cl | 14 +++---
 4 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6354051c77ebd..e5a9c715c8a07 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18659,22 +18659,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
 return Builder.CreateCall(F, {Addr, Val});
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
-Intrinsic::ID IID;
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19048,7 +19032,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19070,6 +19056,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19114,7 +19102,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   AO = AtomicOrdering::Monotonic;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
-  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
+  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
 llvm::Type *V2BF16Ty = FixedVectorType::get(
 llvm::Type::getBFloatTy(Builder.getContext()), 2);
 Val = Builder.CreateBitCast(Val, V2BF16Ty);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 07e63a8711c7f..e8b6eb57c38d7 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
 
 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> 
[[BC0]] syncscope("agent") monotonic, align 4
+// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x 
bfloat> [[BC0]] syncscope("agent") monotonic, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
 
 // GFX12-LABEL:  test_local_add_2bf16
@@ -48,7 +48,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2f16
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
 
 // GFX12-LABEL:  test_flat_add_2f

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96876

>From 4c2c159a6c3d4d7f509947bed2dc7873180565dd Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 23:18:32 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max
 f64 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 36 +--
 .../builtins-fp-atomics-gfx90a.cl | 18 ++
 2 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index e5a9c715c8a07..e925b02ca110a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18633,32 +18633,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
-Intrinsic::ID IID;
-llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmax;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmax;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F =
-CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19034,7 +19008,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19061,8 +19039,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   BinOp = llvm::AtomicRMWInst::FMin;
   break;
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
   BinOp = llvm::AtomicRMWInst::FMax;
   break;
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 9381ce951df3e..556e553903d1a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -27,7 +27,8 @@ void test_global_add_half2(__global half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_global_global_min_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmin ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_global_min_f64$local
 // GFX90A:  global_atomic_min_f64
 void test_global_global_min_f64(__global double *addr, double x){
@@ -36,7 +37,8 @@ void test_global_global_min_f64(__global double *addr, double 
x){
 }
 
 // CHECK-LABEL: test_global_max_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmax ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_max_f64$local
 // GFX90A:  global_atomic_max_f64
 void test_global_max_f64(__global double *addr, double x){
@@ -65,7 +67,8 @@ void test_flat_global_add_f64(__global double *addr, doub

[llvm-branch-commits] [compiler-rt] [TySan] Fixed false positive when accessing offset member variables (PR #95387)

2024-06-27 Thread Florian Hahn via llvm-branch-commits


@@ -221,7 +221,17 @@ __tysan_check(void *addr, int size, tysan_type_descriptor 
*td, int flags) {
 OldTDPtr -= i;
 OldTD = *OldTDPtr;
 
-if (!isAliasingLegal(td, OldTD))
+tysan_type_descriptor *InternalMember = OldTD;

fhahn wrote:

Could you add a comment here indicating what this does?

https://github.com/llvm/llvm-project/pull/95387
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [TySan] Fixed false positive when accessing offset member variables (PR #95387)

2024-06-27 Thread via llvm-branch-commits

https://github.com/gbMattN updated 
https://github.com/llvm/llvm-project/pull/95387

>From 8b9530d2efd2e8474fbeb1b788dd642d116fbc1d Mon Sep 17 00:00:00 2001
From: Matthew Nagy 
Date: Thu, 13 Jun 2024 09:54:04 +
Subject: [PATCH] [TySan] Fixed false positive when accessing offset member
 variables

---
 compiler-rt/lib/tysan/tysan.cpp | 19 ++-
 compiler-rt/test/tysan/struct-members.c | 31 +
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 compiler-rt/test/tysan/struct-members.c

diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp
index f627851d049e6..8235b0ec2b55e 100644
--- a/compiler-rt/lib/tysan/tysan.cpp
+++ b/compiler-rt/lib/tysan/tysan.cpp
@@ -221,7 +221,24 @@ __tysan_check(void *addr, int size, tysan_type_descriptor 
*td, int flags) {
 OldTDPtr -= i;
 OldTD = *OldTDPtr;
 
-if (!isAliasingLegal(td, OldTD))
+// When shadow memory is set for global objects, the entire object is 
tagged with the struct type
+// This means that when you access a member variable, tysan reads that as 
you accessing a struct midway
+// through, with 'i' being the offset
+// Therefore, if you are accessing a struct, we need to find the member 
type. We can go through the
+// members of the struct type and see if there is a member at the offset 
you are accessing the struct by.
+// If there is indeed a member starting at offset 'i' in the struct, we 
should check aliasing legality
+// with that type. If there isn't, we run alias checking on the struct 
which will give us the correct error.
+tysan_type_descriptor *InternalMember = OldTD;
+if (OldTD->Tag == TYSAN_STRUCT_TD) {
+  for (int j = 0; j < OldTD->Struct.MemberCount; j++) {
+if (OldTD->Struct.Members[j].Offset == i) {
+  InternalMember = OldTD->Struct.Members[j].Type;
+  break;
+}
+  }
+}
+
+if (!isAliasingLegal(td, InternalMember))
   reportError(addr, size, td, OldTD, AccessStr,
   "accesses part of an existing object", -i, pc, bp, sp);
 
diff --git a/compiler-rt/test/tysan/struct-members.c 
b/compiler-rt/test/tysan/struct-members.c
new file mode 100644
index 0..76ea3c431dd7b
--- /dev/null
+++ b/compiler-rt/test/tysan/struct-members.c
@@ -0,0 +1,31 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+
+struct X {
+  int a, b, c;
+} x;
+
+static struct X xArray[2];
+
+int main() {
+  x.a = 1;
+  x.b = 2;
+  x.c = 3;
+
+  printf("%d %d %d\n", x.a, x.b, x.c);
+  // CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
+
+  for (size_t i = 0; i < 2; i++) {
+xArray[i].a = 1;
+xArray[i].b = 1;
+xArray[i].c = 1;
+  }
+
+  struct X *xPtr = (struct X *)&(xArray[0].c);
+  xPtr->a = 1;
+  // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+  // CHECK: WRITE of size 4 at {{.*}} with type int (in X at offset 0) 
accesses an existing object of type int (in X at offset 8)
+  // CHECK: {{#0 0x.* in main .*struct-members.c:}}[[@LINE-3]]
+}

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [TySan] Fixed false positive when accessing offset member variables (PR #95387)

2024-06-27 Thread via llvm-branch-commits


@@ -221,7 +221,17 @@ __tysan_check(void *addr, int size, tysan_type_descriptor 
*td, int flags) {
 OldTDPtr -= i;
 OldTD = *OldTDPtr;
 
-if (!isAliasingLegal(td, OldTD))
+tysan_type_descriptor *InternalMember = OldTD;

gbMattN wrote:

Done!

https://github.com/llvm/llvm-project/pull/95387
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [TySan] Fixed false positive when accessing offset member variables (PR #95387)

2024-06-27 Thread via llvm-branch-commits

https://github.com/gbMattN edited 
https://github.com/llvm/llvm-project/pull/95387
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [TySan] A Type Sanitizer (Clang) (PR #76260)

2024-06-27 Thread Florian Hahn via llvm-branch-commits

https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/76260

>From f45d4dc65537f3664472c873062fbda2a9bed984 Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Thu, 18 Apr 2024 23:01:03 +0100
Subject: [PATCH 1/2] [TySan] A Type Sanitizer (Clang)

---
 clang/include/clang/Basic/Features.def |  1 +
 clang/include/clang/Basic/Sanitizers.def   |  3 ++
 clang/include/clang/Driver/SanitizerArgs.h |  1 +
 clang/lib/CodeGen/BackendUtil.cpp  |  6 +++
 clang/lib/CodeGen/CGDecl.cpp   |  3 +-
 clang/lib/CodeGen/CGDeclCXX.cpp|  4 ++
 clang/lib/CodeGen/CodeGenFunction.cpp  |  2 +
 clang/lib/CodeGen/CodeGenModule.cpp| 12 +++---
 clang/lib/CodeGen/CodeGenTBAA.cpp  |  6 ++-
 clang/lib/CodeGen/SanitizerMetadata.cpp| 44 +-
 clang/lib/CodeGen/SanitizerMetadata.h  | 13 ---
 clang/lib/Driver/SanitizerArgs.cpp | 13 +--
 clang/lib/Driver/ToolChains/CommonArgs.cpp |  6 ++-
 clang/lib/Driver/ToolChains/Darwin.cpp |  6 +++
 clang/lib/Driver/ToolChains/Linux.cpp  |  2 +
 clang/test/Driver/sanitizer-ld.c   | 23 +++
 16 files changed, 116 insertions(+), 29 deletions(-)

diff --git a/clang/include/clang/Basic/Features.def 
b/clang/include/clang/Basic/Features.def
index 53f410d3cb4bd..6a9921ffee884 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -100,6 +100,7 @@ FEATURE(numerical_stability_sanitizer, 
LangOpts.Sanitize.has(SanitizerKind::Nume
 FEATURE(memory_sanitizer,
 LangOpts.Sanitize.hasOneOf(SanitizerKind::Memory |
SanitizerKind::KernelMemory))
+FEATURE(type_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Type))
 FEATURE(thread_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Thread))
 FEATURE(dataflow_sanitizer, LangOpts.Sanitize.has(SanitizerKind::DataFlow))
 FEATURE(scudo, LangOpts.Sanitize.hasOneOf(SanitizerKind::Scudo))
diff --git a/clang/include/clang/Basic/Sanitizers.def 
b/clang/include/clang/Basic/Sanitizers.def
index bee35e9dca7c3..4b59b43437c2c 100644
--- a/clang/include/clang/Basic/Sanitizers.def
+++ b/clang/include/clang/Basic/Sanitizers.def
@@ -73,6 +73,9 @@ SANITIZER("fuzzer", Fuzzer)
 // libFuzzer-required instrumentation, no linking.
 SANITIZER("fuzzer-no-link", FuzzerNoLink)
 
+// TypeSanitizer
+SANITIZER("type", Type)
+
 // ThreadSanitizer
 SANITIZER("thread", Thread)
 
diff --git a/clang/include/clang/Driver/SanitizerArgs.h 
b/clang/include/clang/Driver/SanitizerArgs.h
index 47ef175302679..fde2ea3eac8ea 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -86,6 +86,7 @@ class SanitizerArgs {
   bool needsHwasanAliasesRt() const {
 return needsHwasanRt() && HwasanUseAliases;
   }
+  bool needsTysanRt() const { return Sanitizers.has(SanitizerKind::Type); }
   bool needsTsanRt() const { return Sanitizers.has(SanitizerKind::Thread); }
   bool needsMsanRt() const { return Sanitizers.has(SanitizerKind::Memory); }
   bool needsFuzzer() const { return Sanitizers.has(SanitizerKind::Fuzzer); }
diff --git a/clang/lib/CodeGen/BackendUtil.cpp 
b/clang/lib/CodeGen/BackendUtil.cpp
index b09680086248d..ff7cc5a8e48ba 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -80,6 +80,7 @@
 #include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h"
 #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
 #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
+#include "llvm/Transforms/Instrumentation/TypeSanitizer.h"
 #include "llvm/Transforms/ObjCARC.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
 #include "llvm/Transforms/Scalar/GVN.h"
@@ -707,6 +708,11 @@ static void addSanitizers(const Triple &TargetTriple,
   MPM.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
 }
 
+if (LangOpts.Sanitize.has(SanitizerKind::Type)) {
+  MPM.addPass(ModuleTypeSanitizerPass());
+  MPM.addPass(createModuleToFunctionPassAdaptor(TypeSanitizerPass()));
+}
+
 auto ASanPass = [&](SanitizerMask Mask, bool CompileKernel) {
   if (LangOpts.Sanitize.has(Mask)) {
 bool UseGlobalGC = asanUseGlobalsGC(TargetTriple, CodeGenOpts);
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 90aa4c0745a8a..4933f0c95fa8a 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -484,7 +484,8 @@ void CodeGenFunction::EmitStaticVarDecl(const VarDecl &D,
   LocalDeclMap.find(&D)->second = Address(castedAddr, elemTy, alignment);
   CGM.setStaticLocalDeclAddress(&D, castedAddr);
 
-  CGM.getSanitizerMetadata()->reportGlobal(var, D);
+  CGM.getSanitizerMetadata()->reportGlobalToASan(var, D);
+  CGM.getSanitizerMetadata()->reportGlobalToTySan(var, D);
 
   // Emit global variable debug descriptor for static vars.
   CGDebugInfo *DI = getDebugInfo();
diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX

[llvm-branch-commits] [clang] [compiler-rt] [TySan] A Type Sanitizer (Runtime Library) (PR #76261)

2024-06-27 Thread Florian Hahn via llvm-branch-commits

https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/76261

>From 733b3ed3f7441453889157834e0a5b6c288bf976 Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Thu, 27 Jun 2024 15:48:05 +0100
Subject: [PATCH] [tysan] Add runtime support

---
 clang/runtime/CMakeLists.txt  |   2 +-
 .../cmake/Modules/AllSupportedArchDefs.cmake  |   1 +
 compiler-rt/cmake/config-ix.cmake |  14 +-
 compiler-rt/lib/tysan/CMakeLists.txt  |  64 
 compiler-rt/lib/tysan/lit.cfg |  35 ++
 compiler-rt/lib/tysan/lit.site.cfg.in |  12 +
 compiler-rt/lib/tysan/tysan.cpp   | 339 ++
 compiler-rt/lib/tysan/tysan.h |  79 
 compiler-rt/lib/tysan/tysan.syms.extra|   2 +
 compiler-rt/lib/tysan/tysan_flags.inc |  17 +
 compiler-rt/lib/tysan/tysan_interceptors.cpp  | 250 +
 compiler-rt/lib/tysan/tysan_platform.h|  93 +
 compiler-rt/test/tysan/CMakeLists.txt |  32 ++
 compiler-rt/test/tysan/anon-ns.cpp|  41 +++
 compiler-rt/test/tysan/anon-same-struct.c |  26 ++
 compiler-rt/test/tysan/anon-struct.c  |  27 ++
 compiler-rt/test/tysan/basic.c|  65 
 compiler-rt/test/tysan/char-memcpy.c  |  45 +++
 compiler-rt/test/tysan/global.c   |  31 ++
 compiler-rt/test/tysan/int-long.c |  21 ++
 compiler-rt/test/tysan/lit.cfg.py | 139 +++
 compiler-rt/test/tysan/lit.site.cfg.py.in |  17 +
 compiler-rt/test/tysan/ptr-float.c|  19 +
 ...ruct-offset-multiple-compilation-units.cpp |  51 +++
 compiler-rt/test/tysan/struct-offset.c|  26 ++
 compiler-rt/test/tysan/struct.c   |  39 ++
 compiler-rt/test/tysan/union-wr-wr.c  |  18 +
 compiler-rt/test/tysan/violation-pr45282.c|  32 ++
 compiler-rt/test/tysan/violation-pr47137.c|  40 +++
 compiler-rt/test/tysan/violation-pr51837.c|  34 ++
 compiler-rt/test/tysan/violation-pr62544.c|  24 ++
 compiler-rt/test/tysan/violation-pr62828.cpp  |  44 +++
 compiler-rt/test/tysan/violation-pr68655.cpp  |  40 +++
 compiler-rt/test/tysan/violation-pr86685.c|  29 ++
 34 files changed, 1746 insertions(+), 2 deletions(-)
 create mode 100644 compiler-rt/lib/tysan/CMakeLists.txt
 create mode 100644 compiler-rt/lib/tysan/lit.cfg
 create mode 100644 compiler-rt/lib/tysan/lit.site.cfg.in
 create mode 100644 compiler-rt/lib/tysan/tysan.cpp
 create mode 100644 compiler-rt/lib/tysan/tysan.h
 create mode 100644 compiler-rt/lib/tysan/tysan.syms.extra
 create mode 100644 compiler-rt/lib/tysan/tysan_flags.inc
 create mode 100644 compiler-rt/lib/tysan/tysan_interceptors.cpp
 create mode 100644 compiler-rt/lib/tysan/tysan_platform.h
 create mode 100644 compiler-rt/test/tysan/CMakeLists.txt
 create mode 100644 compiler-rt/test/tysan/anon-ns.cpp
 create mode 100644 compiler-rt/test/tysan/anon-same-struct.c
 create mode 100644 compiler-rt/test/tysan/anon-struct.c
 create mode 100644 compiler-rt/test/tysan/basic.c
 create mode 100644 compiler-rt/test/tysan/char-memcpy.c
 create mode 100644 compiler-rt/test/tysan/global.c
 create mode 100644 compiler-rt/test/tysan/int-long.c
 create mode 100644 compiler-rt/test/tysan/lit.cfg.py
 create mode 100644 compiler-rt/test/tysan/lit.site.cfg.py.in
 create mode 100644 compiler-rt/test/tysan/ptr-float.c
 create mode 100644 
compiler-rt/test/tysan/struct-offset-multiple-compilation-units.cpp
 create mode 100644 compiler-rt/test/tysan/struct-offset.c
 create mode 100644 compiler-rt/test/tysan/struct.c
 create mode 100644 compiler-rt/test/tysan/union-wr-wr.c
 create mode 100644 compiler-rt/test/tysan/violation-pr45282.c
 create mode 100644 compiler-rt/test/tysan/violation-pr47137.c
 create mode 100644 compiler-rt/test/tysan/violation-pr51837.c
 create mode 100644 compiler-rt/test/tysan/violation-pr62544.c
 create mode 100644 compiler-rt/test/tysan/violation-pr62828.cpp
 create mode 100644 compiler-rt/test/tysan/violation-pr68655.cpp
 create mode 100644 compiler-rt/test/tysan/violation-pr86685.c

diff --git a/clang/runtime/CMakeLists.txt b/clang/runtime/CMakeLists.txt
index 65fcdc2868f03..ff2605b23d25b 100644
--- a/clang/runtime/CMakeLists.txt
+++ b/clang/runtime/CMakeLists.txt
@@ -122,7 +122,7 @@ if(LLVM_BUILD_EXTERNAL_COMPILER_RT AND EXISTS 
${COMPILER_RT_SRC_ROOT}/)
COMPONENT compiler-rt)
 
   # Add top-level targets that build specific compiler-rt runtimes.
-  set(COMPILER_RT_RUNTIMES fuzzer asan builtins dfsan lsan msan profile tsan 
ubsan ubsan-minimal)
+  set(COMPILER_RT_RUNTIMES fuzzer asan builtins dfsan lsan msan profile tsan 
tysan ubsan ubsan-minimal)
   foreach(runtime ${COMPILER_RT_RUNTIMES})
 get_ext_project_build_command(build_runtime_cmd ${runtime})
 add_custom_target(${runtime}
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake 
b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index ac4a71202384d..4701b58de4bda 1006

[llvm-branch-commits] [BOLT][NFC] Refactoring CallGraph (PR #96922)

2024-06-27 Thread shaw young via llvm-branch-commits

https://github.com/shawbyoung created 
https://github.com/llvm/llvm-project/pull/96922

Moved CallGraph and BinaryFunctionCallGraph from Passes to
Core for future use in stale matching.

Test Plan: n/a



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [BOLT][NFC] Refactoring CallGraph (PR #96922)

2024-06-27 Thread shaw young via llvm-branch-commits

https://github.com/shawbyoung edited 
https://github.com/llvm/llvm-project/pull/96922
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [BOLT][NFC] Refactoring CallGraph (PR #96922)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-bolt

Author: shaw young (shawbyoung)


Changes

Moved CallGraph and BinaryFunctionCallGraph from Passes to
Core for future use in stale matching.


---
Full diff: https://github.com/llvm/llvm-project/pull/96922.diff


18 Files Affected:

- (renamed) bolt/include/bolt/Core/BinaryFunctionCallGraph.h (+1-1) 
- (renamed) bolt/include/bolt/Core/CallGraph.h () 
- (renamed) bolt/include/bolt/Core/CallGraphWalker.h () 
- (modified) bolt/include/bolt/Passes/HFSort.h (+1-1) 
- (modified) bolt/include/bolt/Passes/RegReAssign.h (+1-1) 
- (modified) bolt/include/bolt/Passes/ReorderFunctions.h (+1-1) 
- (renamed) bolt/lib/Core/BinaryFunctionCallGraph.cpp (+1-1) 
- (modified) bolt/lib/Core/CMakeLists.txt (+3) 
- (renamed) bolt/lib/Core/CallGraph.cpp (+1-1) 
- (renamed) bolt/lib/Core/CallGraphWalker.cpp (+2-2) 
- (modified) bolt/lib/Passes/CMakeLists.txt (-3) 
- (modified) bolt/lib/Passes/FrameAnalysis.cpp (+1-1) 
- (modified) bolt/lib/Passes/FrameOptimizer.cpp (+1-1) 
- (modified) bolt/lib/Passes/IndirectCallPromotion.cpp (+1-1) 
- (modified) bolt/lib/Passes/JTFootprintReduction.cpp (+1-1) 
- (modified) bolt/lib/Passes/RegAnalysis.cpp (+1-1) 
- (modified) bolt/lib/Passes/RegReAssign.cpp (+1-1) 
- (modified) bolt/lib/Passes/StokeInfo.cpp (+1-1) 


``diff
diff --git a/bolt/include/bolt/Passes/BinaryFunctionCallGraph.h 
b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
similarity index 98%
rename from bolt/include/bolt/Passes/BinaryFunctionCallGraph.h
rename to bolt/include/bolt/Core/BinaryFunctionCallGraph.h
index 52e17db4f50ce..4579c33985254 100644
--- a/bolt/include/bolt/Passes/BinaryFunctionCallGraph.h
+++ b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
@@ -9,7 +9,7 @@
 #ifndef BOLT_PASSES_BINARY_FUNCTION_CALLGRAPH_H
 #define BOLT_PASSES_BINARY_FUNCTION_CALLGRAPH_H
 
-#include "bolt/Passes/CallGraph.h"
+#include "bolt/Core/CallGraph.h"
 #include 
 #include 
 #include 
diff --git a/bolt/include/bolt/Passes/CallGraph.h 
b/bolt/include/bolt/Core/CallGraph.h
similarity index 100%
rename from bolt/include/bolt/Passes/CallGraph.h
rename to bolt/include/bolt/Core/CallGraph.h
diff --git a/bolt/include/bolt/Passes/CallGraphWalker.h 
b/bolt/include/bolt/Core/CallGraphWalker.h
similarity index 100%
rename from bolt/include/bolt/Passes/CallGraphWalker.h
rename to bolt/include/bolt/Core/CallGraphWalker.h
diff --git a/bolt/include/bolt/Passes/HFSort.h 
b/bolt/include/bolt/Passes/HFSort.h
index 3787187733d07..2a35760329bb8 100644
--- a/bolt/include/bolt/Passes/HFSort.h
+++ b/bolt/include/bolt/Passes/HFSort.h
@@ -19,7 +19,7 @@
 #ifndef BOLT_PASSES_HFSORT_H
 #define BOLT_PASSES_HFSORT_H
 
-#include "bolt/Passes/CallGraph.h"
+#include "bolt/Core/CallGraph.h"
 
 #include 
 #include 
diff --git a/bolt/include/bolt/Passes/RegReAssign.h 
b/bolt/include/bolt/Passes/RegReAssign.h
index c50e32ff46e29..a7554a1215104 100644
--- a/bolt/include/bolt/Passes/RegReAssign.h
+++ b/bolt/include/bolt/Passes/RegReAssign.h
@@ -9,7 +9,7 @@
 #ifndef BOLT_PASSES_REGREASSIGN_H
 #define BOLT_PASSES_REGREASSIGN_H
 
-#include "bolt/Passes/BinaryFunctionCallGraph.h"
+#include "bolt/Core/BinaryFunctionCallGraph.h"
 #include "bolt/Passes/BinaryPasses.h"
 #include "bolt/Passes/RegAnalysis.h"
 
diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h 
b/bolt/include/bolt/Passes/ReorderFunctions.h
index 4c88142c58871..7da32324bc933 100644
--- a/bolt/include/bolt/Passes/ReorderFunctions.h
+++ b/bolt/include/bolt/Passes/ReorderFunctions.h
@@ -9,7 +9,7 @@
 #ifndef BOLT_PASSES_REORDER_FUNCTIONS_H
 #define BOLT_PASSES_REORDER_FUNCTIONS_H
 
-#include "bolt/Passes/BinaryFunctionCallGraph.h"
+#include "bolt/Core/BinaryFunctionCallGraph.h"
 #include "bolt/Passes/BinaryPasses.h"
 
 namespace llvm {
diff --git a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp 
b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
similarity index 99%
rename from bolt/lib/Passes/BinaryFunctionCallGraph.cpp
rename to bolt/lib/Core/BinaryFunctionCallGraph.cpp
index bbcc9751c0cbe..86a31188c854a 100644
--- a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
+++ b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
@@ -10,7 +10,7 @@
 //
 
//===--===//
 
-#include "bolt/Passes/BinaryFunctionCallGraph.h"
+#include "bolt/Core/BinaryFunctionCallGraph.h"
 #include "bolt/Core/BinaryContext.h"
 #include "bolt/Core/BinaryFunction.h"
 #include "llvm/Support/CommandLine.h"
diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt
index 873cf67a56291..bb58667066fd8 100644
--- a/bolt/lib/Core/CMakeLists.txt
+++ b/bolt/lib/Core/CMakeLists.txt
@@ -17,8 +17,11 @@ add_llvm_library(LLVMBOLTCore
   BinaryData.cpp
   BinaryEmitter.cpp
   BinaryFunction.cpp
+  BinaryFunctionCallGraph.cpp
   BinaryFunctionProfile.cpp
   BinarySection.cpp
+  CallGraph.cpp
+  CallGraphWalker.cpp
   DebugData.cpp
   DebugNames.cpp
   DIEBuilder.cpp
diff --git a/bolt/lib/Passes/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp
similarity index 9

[llvm-branch-commits] [BOLT][NFC] Refactoring CallGraph (PR #96922)

2024-06-27 Thread Amir Ayupov via llvm-branch-commits

https://github.com/aaupov commented:

Please build with shared libraries mode to ensure cross-component dependencies 
are satisfied.

https://github.com/llvm/llvm-project/pull/96922
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [BOLT][NFC] Refactoring CallGraph (PR #96922)

2024-06-27 Thread Amir Ayupov via llvm-branch-commits

https://github.com/aaupov edited https://github.com/llvm/llvm-project/pull/96922
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [BOLT][NFC] Refactoring CallGraph (PR #96922)

2024-06-27 Thread Amir Ayupov via llvm-branch-commits


@@ -10,7 +10,7 @@
 //
 
//===--===//
 
-#include "bolt/Passes/CallGraph.h"
+#include "bolt/Core/CallGraph.h"

aaupov wrote:

Please also update file headers (first line)

https://github.com/llvm/llvm-project/pull/96922
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT][NFC] Refactoring CallGraph (PR #96922)

2024-06-27 Thread shaw young via llvm-branch-commits

https://github.com/shawbyoung updated 
https://github.com/llvm/llvm-project/pull/96922

>From 84a2f69e71372891e2721552b10e0105b9430257 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Thu, 27 Jun 2024 09:28:22 -0700
Subject: [PATCH] Updated file headers

Created using spr 1.3.4
---
 bolt/include/bolt/Core/BinaryFunctionCallGraph.h | 2 +-
 bolt/include/bolt/Core/CallGraph.h   | 2 +-
 bolt/include/bolt/Core/CallGraphWalker.h | 2 +-
 bolt/lib/Core/BinaryFunctionCallGraph.cpp| 2 +-
 bolt/lib/Core/CallGraph.cpp  | 2 +-
 bolt/lib/Core/CallGraphWalker.cpp| 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryFunctionCallGraph.h 
b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
index 4579c33985254..4ff5b1b94c5e5 100644
--- a/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
+++ b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.h --*- C++ 
-*-===//
+//===- bolt/Core/CallGraph.h --*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Core/CallGraph.h 
b/bolt/include/bolt/Core/CallGraph.h
index bdbc50bb78e87..2fc18e61afcaa 100644
--- a/bolt/include/bolt/Core/CallGraph.h
+++ b/bolt/include/bolt/Core/CallGraph.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.h --*- C++ 
-*-===//
+//===- bolt/Core/CallGraph.h --*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Core/CallGraphWalker.h 
b/bolt/include/bolt/Core/CallGraphWalker.h
index ac45644be362f..b0a73aee14369 100644
--- a/bolt/include/bolt/Core/CallGraphWalker.h
+++ b/bolt/include/bolt/Core/CallGraphWalker.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraphWalker.h *- C++ 
-*-===//
+//===- bolt/Core/CallGraphWalker.h *- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/BinaryFunctionCallGraph.cpp 
b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
index 86a31188c854a..b4b7897aa426a 100644
--- a/bolt/lib/Core/BinaryFunctionCallGraph.cpp
+++ b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/BinaryFunctionCallGraph.cpp 
===//
+//===- bolt/Core/BinaryFunctionCallGraph.cpp ===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp
index a7ea64fbbcf58..5f6bd11e9e97a 100644
--- a/bolt/lib/Core/CallGraph.cpp
+++ b/bolt/lib/Core/CallGraph.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.cpp 
--===//
+//===- bolt/Core/CallGraph.cpp --===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/CallGraphWalker.cpp 
b/bolt/lib/Core/CallGraphWalker.cpp
index 9d0087f79d17f..cbfa178d8e068 100644
--- a/bolt/lib/Core/CallGraphWalker.cpp
+++ b/bolt/lib/Core/CallGraphWalker.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraphWalker.cpp 
===//
+//===- bolt/Core/CallGraphWalker.cpp ===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Enable atomic optimizer for 64 bit divergent values (PR #96934)

2024-06-27 Thread Vikram Hegde via llvm-branch-commits

vikramRH wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack 
> [on Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/96934?utm_source=stack-comment-downstack-mergeability-warning).
> [Learn more](https://graphite.dev/docs/merge-pull-requests)

* **#96934** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/96934?utm_source=stack-comment-icon) 👈
* **#96933** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/96933?utm_source=stack-comment-icon)
* `main`

This stack of pull requests is managed by Graphite. 
[Learn more about stacking](https://stacking.dev/?utm_source=stack-comment).


Join @vikramRH and the rest of your teammates on [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://graphite.dev?utm-source=stack-comment) Graphite
  

https://github.com/llvm/llvm-project/pull/96934
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT][NFC] Refactoring CallGraph (PR #96922)

2024-06-27 Thread Amir Ayupov via llvm-branch-commits

https://github.com/aaupov approved this pull request.

LGTM, but please ensure that the diff passes NFC checks and the shared build works.

https://github.com/llvm/llvm-project/pull/96922
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Enable atomic optimizer for 64 bit divergent values (PR #96934)

2024-06-27 Thread Vikram Hegde via llvm-branch-commits

https://github.com/vikramRH ready_for_review 
https://github.com/llvm/llvm-project/pull/96934
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Enable atomic optimizer for 64 bit divergent values (PR #96934)

2024-06-27 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-globalisel

Author: Vikram Hegde (vikramRH)


Changes



---

Patch is 1.18 MiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/96934.diff


11 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp (+17-5) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll 
(+1158-188) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll 
(+872-166) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll 
(+564-74) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll (+1138-194) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll 
(+486-18) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll 
(+414-18) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+2992-1062) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+1894-579) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+1894-579) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+2993-1063) 


``diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index cdd1953dca4ec..feffc3adb21b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
   return Changed;
 }
 
+static bool shouldOptimize(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+return true;
+  case Type::IntegerTyID: {
+if (Ty->getIntegerBitWidth() == 32 || Ty->getIntegerBitWidth() == 64)
+  return true;
+  default:
+return false;
+  }
+  }
+}
+
 void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
   // Early exit for unhandled address space atomic instructions.
   switch (I.getPointerAddressSpace()) {
@@ -230,8 +244,7 @@ void 
AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
   // value to the atomic calculation. We can only optimize divergent values if
   // we have DPP available on our subtarget, and the atomic operation is 32
   // bits.
-  if (ValDivergent &&
-  (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
+  if (ValDivergent && (!ST->hasDPP() || !shouldOptimize(I.getType( {
 return;
   }
 
@@ -313,8 +326,7 @@ void 
AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
   // value to the atomic calculation. We can only optimize divergent values if
   // we have DPP available on our subtarget, and the atomic operation is 32
   // bits.
-  if (ValDivergent &&
-  (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
+  if (ValDivergent && (!ST->hasDPP() || !shouldOptimize(I.getType( {
 return;
   }
 
@@ -745,7 +757,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction 
&I,
 // of each active lane in the wavefront. This will be our new value
 // which we will provide to the atomic operation.
 Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-assert(TyBitWidth == 32);
+assert(TyBitWidth == 32 || TyBitWidth == 64);
 NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
  {NewV, LastLaneIdx});
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
index b058ad1023e13..8ad91f001bd72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
@@ -1,249 +1,1219 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs 
-stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs 
-stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a 
-amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs 
-stop-after=instruction-select < %s | FileCheck 
-check-prefixes=GFX90A,GFX90A_ITERATIVE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a 
-amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs 
-stop-after=instruction-select < %s | FileCheck 
-check-prefixes=GFX90A,GFX90A_DPP %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 
-amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs 
-stop-after=instruction-select < %s | FileCheck 
-check-prefixes=GFX940,GFX940_ITERATIVE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 
-amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs 
-stop-after=instruction-select < %s | FileCheck 
-check-prefixes=GFX940,GFX940_DPP %s
 
 de

[llvm-branch-commits] [flang] 5c45ad8 - Revert "[flang] add extra component information in fir.type_info (#96746)"

2024-06-27 Thread via llvm-branch-commits

Author: jeanPerier
Date: 2024-06-27T19:21:19+02:00
New Revision: 5c45ad8a20989bd9ca9fdf8148ce690dc28c834c

URL: 
https://github.com/llvm/llvm-project/commit/5c45ad8a20989bd9ca9fdf8148ce690dc28c834c
DIFF: 
https://github.com/llvm/llvm-project/commit/5c45ad8a20989bd9ca9fdf8148ce690dc28c834c.diff

LOG: Revert "[flang] add extra component information in fir.type_info (#96746)"

This reverts commit 1448ed2000ff0be17025dab0aad7412d054425eb.

Added: 


Modified: 
flang/include/flang/Optimizer/Builder/FIRBuilder.h
flang/include/flang/Optimizer/Dialect/FIROps.td
flang/include/flang/Optimizer/Support/InternalNames.h
flang/include/flang/Optimizer/Support/Utils.h
flang/lib/Lower/Bridge.cpp
flang/lib/Optimizer/Builder/FIRBuilder.cpp
flang/lib/Optimizer/Dialect/FIROps.cpp
flang/lib/Optimizer/Support/CMakeLists.txt
flang/lib/Optimizer/Support/InternalNames.cpp
flang/test/Fir/fir-ops.fir

Removed: 
flang/lib/Optimizer/Support/Utils.cpp
flang/test/Lower/HLFIR/type-info-components.f90



diff  --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h 
b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index ea35b298c0209..f9ef8b7566299 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -286,10 +286,6 @@ class FirOpBuilder : public mlir::OpBuilder, public 
mlir::OpBuilder::Listener {
   fir::StringLitOp createStringLitOp(mlir::Location loc,
  llvm::StringRef string);
 
-  std::pair
-  createTypeInfoOp(mlir::Location loc, fir::RecordType recordType,
-   fir::RecordType parentType);
-
   
//======//
   // Linkage helpers (inline). The default linkage is external.
   
//======//

diff  --git a/flang/include/flang/Optimizer/Dialect/FIROps.td 
b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 5b03806614f9b..baf095263479b 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2956,10 +2956,7 @@ def fir_TypeInfoOp : fir_Op<"type_info",
 
   let hasVerifier = 1;
 
-  let regions = (region
-MaxSizedRegion<1>:$dispatch_table,
-MaxSizedRegion<1>:$component_info
-  );
+  let regions = (region MaxSizedRegion<1>:$dispatch_table);
 
   let builders = [
 OpBuilder<(ins "fir::RecordType":$type, "fir::RecordType":$parent_type,
@@ -2970,7 +2967,6 @@ def fir_TypeInfoOp : fir_Op<"type_info",
 $sym_name (`noinit` $no_init^)? (`nodestroy` $no_destroy^)?
 (`nofinal` $no_final^)? (`extends` $parent_type^)? attr-dict `:` $type
 (`dispatch_table` $dispatch_table^)?
-(`component_info` $component_info^)?
   }];
 
   let extraClassDeclaration = [{
@@ -3014,24 +3010,6 @@ def fir_DTEntryOp : fir_Op<"dt_entry", 
[HasParent<"TypeInfoOp">]> {
   }];
 }
 
-def fir_DTComponentOp : fir_Op<"dt_component", [HasParent<"TypeInfoOp">]> {
-  let summary = "define extra information about a component inside 
fir.type_info";
-
-  let description = [{
-```
-  fir.dt_component i lbs [-1,2] init @init_val
-```
-  }];
-
-  let arguments = (ins
-StrAttr:$name,
-OptionalAttr:$lower_bounds,
-OptionalAttr:$init_val
-  );
-
-  let assemblyFormat = "$name (`lbs` $lower_bounds^)? (`init` $init_val^)? 
attr-dict";
-}
-
 def fir_AbsentOp : fir_OneResultOp<"absent", [NoMemoryEffect]> {
   let summary = "create value to be passed for absent optional function 
argument";
   let description = [{

diff  --git a/flang/include/flang/Optimizer/Support/InternalNames.h 
b/flang/include/flang/Optimizer/Support/InternalNames.h
index ff23510922372..23a03854c4abd 100644
--- a/flang/include/flang/Optimizer/Support/InternalNames.h
+++ b/flang/include/flang/Optimizer/Support/InternalNames.h
@@ -15,7 +15,6 @@
 #include 
 
 static constexpr llvm::StringRef typeDescriptorSeparator = ".dt.";
-static constexpr llvm::StringRef componentInitSeparator = ".di.";
 static constexpr llvm::StringRef bindingTableSeparator = ".v.";
 static constexpr llvm::StringRef boxprocSuffix = "UnboxProc";
 
@@ -157,11 +156,6 @@ struct NameUniquer {
   static std::string
   getTypeDescriptorBindingTableName(llvm::StringRef mangledTypeName);
 
-  /// Given a mangled derived type name and a component name, get the name of
-  /// the global object containing the component default initialization.
-  static std::string getComponentInitName(llvm::StringRef mangledTypeName,
-  llvm::StringRef componentName);
-
   /// Remove markers that have been added when doing partial type
   /// conversions. mlir::Type cannot be mutated in a pass, so new
   /// fir::RecordType must be created when lowering member types.

diff  --git a/flang/include/flang/Optimizer/Support/Utils.h 
b/flang/include/fl

[llvm-branch-commits] [llvm] [AMDGPU] Enable atomic optimizer for 64 bit divergent values (PR #96934)

2024-06-27 Thread Christudasan Devadasan via llvm-branch-commits


@@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
   return Changed;
 }
 
+static bool shouldOptimizeForType(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+return true;
+  case Type::IntegerTyID: {
+if (Ty->getIntegerBitWidth() == 32 || Ty->getIntegerBitWidth() == 64)

cdevadas wrote:

Get Ty->getIntegerBitWidth() just once outside?

https://github.com/llvm/llvm-project/pull/96934
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Fangrui Song via llvm-branch-commits

https://github.com/MaskRay updated 
https://github.com/llvm/llvm-project/pull/96282


___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Fangrui Song via llvm-branch-commits

https://github.com/MaskRay edited 
https://github.com/llvm/llvm-project/pull/96282
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Fangrui Song via llvm-branch-commits

https://github.com/MaskRay updated 
https://github.com/llvm/llvm-project/pull/96282


___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Fangrui Song via llvm-branch-commits

MaskRay wrote:

The non-determinism is now restricted to `LLVM_ENABLE_ABI_BREAKING_CHECKS` 
builds.

`LLVM_ENABLE_ABI_BREAKING_CHECKS` defaults to `WITH_ASSERTS`. Release builds 
that disable assertions disable `LLVM_ENABLE_ABI_BREAKING_CHECKS`. This change 
yields a slight code size/performance advantage by eliminating a variable read.



https://github.com/llvm/llvm-project/pull/96282
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Enable atomic optimizer for 64 bit divergent values (PR #96934)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits


@@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
   return Changed;
 }
 
+static bool shouldOptimizeForType(Type *Ty) {

arsenm wrote:

Name this something that states why this should be handled, not a vague 
"shouldOptimize" 

https://github.com/llvm/llvm-project/pull/96934
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Enable atomic optimizer for 64 bit divergent values (PR #96934)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits


@@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
   return Changed;
 }
 
+static bool shouldOptimizeForType(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+return true;
+  case Type::IntegerTyID: {
+if (Ty->getIntegerBitWidth() == 32 || Ty->getIntegerBitWidth() == 64)
+  return true;
+  default:

arsenm wrote:

Don't forget pointers. In a follow-up, this should really just handle half / 
bfloat and vectors.

https://github.com/llvm/llvm-project/pull/96934
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT][NFC] Refactoring CallGraph (PR #96922)

2024-06-27 Thread Amir Ayupov via llvm-branch-commits

aaupov wrote:

Please also retitle as an imperative statement, e.g. "Move CallGraph from 
Passes to Core"

https://github.com/llvm/llvm-project/pull/96922
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT][NFC] Move CallGraph from Passes to Core (PR #96922)

2024-06-27 Thread shaw young via llvm-branch-commits

https://github.com/shawbyoung edited 
https://github.com/llvm/llvm-project/pull/96922
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT][NFC] Move CallGraph from Passes to Core (PR #96922)

2024-06-27 Thread shaw young via llvm-branch-commits

https://github.com/shawbyoung updated 
https://github.com/llvm/llvm-project/pull/96922

>From 84a2f69e71372891e2721552b10e0105b9430257 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Thu, 27 Jun 2024 09:28:22 -0700
Subject: [PATCH 1/2] Updated file headers

Created using spr 1.3.4
---
 bolt/include/bolt/Core/BinaryFunctionCallGraph.h | 2 +-
 bolt/include/bolt/Core/CallGraph.h   | 2 +-
 bolt/include/bolt/Core/CallGraphWalker.h | 2 +-
 bolt/lib/Core/BinaryFunctionCallGraph.cpp| 2 +-
 bolt/lib/Core/CallGraph.cpp  | 2 +-
 bolt/lib/Core/CallGraphWalker.cpp| 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryFunctionCallGraph.h 
b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
index 4579c33985254..4ff5b1b94c5e5 100644
--- a/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
+++ b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.h --*- C++ 
-*-===//
+//===- bolt/Core/CallGraph.h --*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Core/CallGraph.h 
b/bolt/include/bolt/Core/CallGraph.h
index bdbc50bb78e87..2fc18e61afcaa 100644
--- a/bolt/include/bolt/Core/CallGraph.h
+++ b/bolt/include/bolt/Core/CallGraph.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.h --*- C++ 
-*-===//
+//===- bolt/Core/CallGraph.h --*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Core/CallGraphWalker.h 
b/bolt/include/bolt/Core/CallGraphWalker.h
index ac45644be362f..b0a73aee14369 100644
--- a/bolt/include/bolt/Core/CallGraphWalker.h
+++ b/bolt/include/bolt/Core/CallGraphWalker.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraphWalker.h *- C++ 
-*-===//
+//===- bolt/Core/CallGraphWalker.h *- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/BinaryFunctionCallGraph.cpp 
b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
index 86a31188c854a..b4b7897aa426a 100644
--- a/bolt/lib/Core/BinaryFunctionCallGraph.cpp
+++ b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/BinaryFunctionCallGraph.cpp 
===//
+//===- bolt/Core/BinaryFunctionCallGraph.cpp ===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp
index a7ea64fbbcf58..5f6bd11e9e97a 100644
--- a/bolt/lib/Core/CallGraph.cpp
+++ b/bolt/lib/Core/CallGraph.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.cpp 
--===//
+//===- bolt/Core/CallGraph.cpp --===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/CallGraphWalker.cpp 
b/bolt/lib/Core/CallGraphWalker.cpp
index 9d0087f79d17f..cbfa178d8e068 100644
--- a/bolt/lib/Core/CallGraphWalker.cpp
+++ b/bolt/lib/Core/CallGraphWalker.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraphWalker.cpp 
===//
+//===- bolt/Core/CallGraphWalker.cpp ===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

>From 1980f9bafd39ec84e71d71bd6a50d3368e1dbbe4 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Thu, 27 Jun 2024 11:27:43 -0700
Subject: [PATCH 2/2] clang-format

Created using spr 1.3.4
---
 bolt/lib/Core/CallGraph.cpp| 14 +++---
 bolt/lib/Passes/FrameAnalysis.cpp  |  2 +-
 bolt/lib/Passes/FrameOptimizer.cpp |  2 +-
 bolt/lib/Passes/RegReAssign.cpp|  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/bolt/lib/Core/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp
index 5f6bd11e9e97a..f1d52737bf556 100644
--- a/bolt/lib/Core/CallGraph.cpp
+++ b/bolt/lib/Core/CallGraph.cpp
@@ -15,11 +15,11 @@
 #define DEBUG_TYPE "callgraph"
 
 #if defined(__x86_64__) && !defined(_MSC_VER)
-#  if (!defined USE_SSECRC)
-#define USE_SSECRC
-#  endif
+#if (!defined USE_SSECRC)
+#define USE_SSECRC
+#endif
 #else
-#  undef USE_SSECRC
+#undef USE_SSECRC
 #endif
 
 static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64_fallback(int64_t k) {
@@ -50,7 +50,7 @@ static inline size_t hash_int6

[llvm-branch-commits] [clang] [clang][test] add testing for the AST matcher reference (PR #94248)

2024-06-27 Thread Aaron Ballman via llvm-branch-commits

https://github.com/AaronBallman commented:

The changes generally LGTM, though I would appreciate a second set of eyes on 
the CMake and Python changes because I have a bit less confidence in my review 
abilities there.

Thank you for adding the documentation to the header file, I think that will 
help folks when working on their own matchers.

One question I have is: do you happen to know how this impacts build times for 
Clang itself? I'm assuming that if ASTMatchers.h isn't modified, CMake won't 
re-run `generate_ast_matcher_doc_tests.py` and so the compile time performance 
hit is only on full rebuilds or when changing the header?

https://github.com/llvm/llvm-project/pull/94248
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [lldb] 148a109 - Revert "Revert "[lldb/test] Mark TestStepScripted.py as XFAIL temporarily (#9…"

2024-06-27 Thread via llvm-branch-commits

Author: Med Ismail Bennani
Date: 2024-06-27T11:34:19-07:00
New Revision: 148a109bcd1592032bdda31694717bbeef5a976d

URL: 
https://github.com/llvm/llvm-project/commit/148a109bcd1592032bdda31694717bbeef5a976d
DIFF: 
https://github.com/llvm/llvm-project/commit/148a109bcd1592032bdda31694717bbeef5a976d.diff

LOG: Revert "Revert "[lldb/test] Mark TestStepScripted.py as XFAIL temporarily 
(#9…"

This reverts commit b949b6420775fe3466dc5a5bf34eab1d14e39e8f.

Added: 


Modified: 
lldb/test/API/functionalities/step_scripted/Steps.py
lldb/test/API/functionalities/step_scripted/TestStepScripted.py

Removed: 




diff  --git a/lldb/test/API/functionalities/step_scripted/Steps.py 
b/lldb/test/API/functionalities/step_scripted/Steps.py
index b121f71538ce4..3325dba753657 100644
--- a/lldb/test/API/functionalities/step_scripted/Steps.py
+++ b/lldb/test/API/functionalities/step_scripted/Steps.py
@@ -92,7 +92,6 @@ def should_stop(self, event):
 
 def stop_description(self, stream):
 stream.Print(f"Stepped until {self.var_name} changed.")
-return True
 
 
 # This plan does nothing, but sets stop_mode to the

diff  --git a/lldb/test/API/functionalities/step_scripted/TestStepScripted.py 
b/lldb/test/API/functionalities/step_scripted/TestStepScripted.py
index 53901718019f9..bb7479414dbbb 100644
--- a/lldb/test/API/functionalities/step_scripted/TestStepScripted.py
+++ b/lldb/test/API/functionalities/step_scripted/TestStepScripted.py
@@ -7,7 +7,6 @@
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
 
-
 class StepScriptedTestCase(TestBase):
 NO_DEBUG_INFO_TESTCASE = True
 
@@ -16,12 +15,14 @@ def setUp(self):
 self.main_source_file = lldb.SBFileSpec("main.c")
 self.runCmd("command script import Steps.py")
 
+@expectedFailureAll()
 def test_standard_step_out(self):
 """Tests stepping with the scripted thread plan laying over a standard
 thread plan for stepping out."""
 self.build()
 self.step_out_with_scripted_plan("Steps.StepOut")
 
+@expectedFailureAll()
 def test_scripted_step_out(self):
 """Tests stepping with the scripted thread plan laying over an another
 scripted thread plan for stepping out."""
@@ -62,10 +63,12 @@ def test_misspelled_plan_name(self):
 # Make sure we didn't let the process run:
 self.assertEqual(stop_id, process.GetStopID(), "Process didn't run")
 
+@expectedFailureAll()
 def test_checking_variable(self):
 """Test that we can call SBValue API's from a scripted thread plan - 
using SBAPI's to step"""
 self.do_test_checking_variable(False)
 
+@expectedFailureAll()
 def test_checking_variable_cli(self):
 """Test that we can call SBValue API's from a scripted thread plan - 
using cli to step"""
 self.do_test_checking_variable(True)



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Nikita Popov via llvm-branch-commits


@@ -322,24 +306,20 @@ struct hash_state {
   }
 };
 
-
-/// A global, fixed seed-override variable.
-///
-/// This variable can be set using the \see llvm::set_fixed_execution_seed
-/// function. See that function for details. Do not, under any circumstances,
-/// set or read this variable.
-extern uint64_t fixed_seed_override;
-
+/// In LLVM_ENABLE_ABI_BREAKING_CHECKS builds, the seed is non-deterministic
+/// (address of a variable) to prevent having users depend on the particular
+/// hash values. On platforms without ASLR, this is still likely
+/// non-deterministic per build.
 inline uint64_t get_execution_seed() {
-  // FIXME: This needs to be a per-execution seed. This is just a placeholder
-  // implementation. Switching to a per-execution seed is likely to flush out
-  // instability bugs and so will happen as its own commit.
-  //
-  // However, if there is a fixed seed override set the first time this is
-  // called, return that instead of the per-execution seed.
-  const uint64_t seed_prime = 0xff51afd7ed558ccdULL;
-  static uint64_t seed = fixed_seed_override ? fixed_seed_override : 
seed_prime;
-  return seed;
+  [[maybe_unused]] static const char seed = 0;

nikic wrote:

Move this inside the `#if`?

https://github.com/llvm/llvm-project/pull/96282
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Nikita Popov via llvm-branch-commits

https://github.com/nikic approved this pull request.

LGTM, but please wait a bit in case there is more feedback.

https://github.com/llvm/llvm-project/pull/96282
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Fangrui Song via llvm-branch-commits

https://github.com/MaskRay updated 
https://github.com/llvm/llvm-project/pull/96282

>From a341e03cb6376d50a4fa219933d3f161e41a567a Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Thu, 27 Jun 2024 14:44:02 -0700
Subject: [PATCH] move seed inside #if

Created using spr 1.3.5-bogner
---
 llvm/include/llvm/ADT/Hashing.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/Hashing.h b/llvm/include/llvm/ADT/Hashing.h
index 397109880bb02..177fb0318bf80 100644
--- a/llvm/include/llvm/ADT/Hashing.h
+++ b/llvm/include/llvm/ADT/Hashing.h
@@ -311,11 +311,11 @@ struct hash_state {
 /// hash values. On platforms without ASLR, this is still likely
 /// non-deterministic per build.
 inline uint64_t get_execution_seed() {
-  [[maybe_unused]] static const char seed = 0;
   // Work around x86-64 negative offset folding for old Clang -fno-pic
   // https://reviews.llvm.org/D93931
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS && 
\
 (!defined(__clang__) || __clang_major__ > 11)
+  static const char seed = 0;
   return static_cast(reinterpret_cast(&seed));
 #else
   return 0xff51afd7ed558ccdULL;

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT] Name similarity function matching (PR #95884)

2024-06-27 Thread Maksim Panchenko via llvm-branch-commits

https://github.com/maksfb commented:

Please refactor the new code into a separate function. Add a comment on how the 
matching is done so that the interface can be understood without reading the 
code.

https://github.com/llvm/llvm-project/pull/95884
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT] Name similarity function matching (PR #95884)

2024-06-27 Thread Maksim Panchenko via llvm-branch-commits

https://github.com/maksfb edited https://github.com/llvm/llvm-project/pull/95884
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT] Name similarity function matching (PR #95884)

2024-06-27 Thread Maksim Panchenko via llvm-branch-commits


@@ -23,6 +26,11 @@ extern cl::opt Verbosity;
 extern cl::OptionCategory BoltOptCategory;
 extern cl::opt InferStaleProfile;
 
+cl::opt NameSimilarityFunctionMatchingThreshold(
+"name-similarity-function-matching-threshold",
+cl::desc("Matches functions using namespace and edit distance."),

maksfb wrote:

nit: use imperative statement.

https://github.com/llvm/llvm-project/pull/95884
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT] Name similarity function matching (PR #95884)

2024-06-27 Thread Maksim Panchenko via llvm-branch-commits


@@ -0,0 +1,64 @@
+## Tests function matching in YAMLProfileReader by name similarity.
+
+# REQUIRES: system-linux
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
+# RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml -v=2 \
+# RUN:   --print-cfg --name-similarity-function-matching-threshold=1 2>&1 
--funcs=main --profile-ignore-hash=0 | FileCheck %s

maksfb wrote:

```suggestion
# RUN:   --print-cfg --name-similarity-function-matching-threshold=1 
--funcs=main --profile-ignore-hash=0 2>&1 | FileCheck %s
```

https://github.com/llvm/llvm-project/pull/95884
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT] Name similarity function matching (PR #95884)

2024-06-27 Thread Maksim Panchenko via llvm-branch-commits


@@ -0,0 +1,64 @@
+## Tests function matching in YAMLProfileReader by name similarity.
+
+# REQUIRES: system-linux
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
+# RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml -v=2 \
+# RUN:   --print-cfg --name-similarity-function-matching-threshold=1 2>&1 
--funcs=main --profile-ignore-hash=0 | FileCheck %s
+
+# CHECK: BOLT-INFO: matched 1 functions with similar names
+
+#--- main.s
+.globl main
+.type  main, @function
+main:
+  .cfi_startproc
+.LBB00:
+  pushq   %rbp
+  movq%rsp, %rbp
+  subq$16, %rsp
+  testq   %rax, %rax
+  js  .LBB03
+.LBB01:
+  jne .LBB04
+.LBB02:
+  nop
+.LBB03:
+  xorl%eax, %eax
+  addq$16, %rsp
+  popq%rbp
+  retq
+.LBB04:
+  xorl%eax, %eax
+  addq$16, %rsp
+  popq%rbp
+  retq
+## For relocations against .text
+.LBB05:
+  call exit

maksfb wrote:

See comments on the other PR.

https://github.com/llvm/llvm-project/pull/95884
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT] Name similarity function matching (PR #95884)

2024-06-27 Thread Maksim Panchenko via llvm-branch-commits


@@ -415,11 +423,116 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
 if (!YamlBF.Used && BF && !ProfiledFunctions.count(BF))
   matchProfileToFunction(YamlBF, *BF);
 
+  // Uses name similarity to match functions that were not matched by name.
+  uint64_t MatchedWithNameSimilarity = 0;
+
+  if (opts::NameSimilarityFunctionMatchingThreshold > 0) {
+ItaniumPartialDemangler ItaniumPartialDemangler;
+
+auto DemangleName = [&](std::string &FunctionName) {
+  StringRef RestoredName = NameResolver::restore(FunctionName);
+  return demangle(RestoredName);
+};
+
+auto DeriveNameSpace = [&](std::string &DemangledName) {
+  if (ItaniumPartialDemangler.partialDemangle(DemangledName.c_str()))
+return std::string("");
+  std::vector Buffer(DemangledName.begin(), DemangledName.end());
+  size_t BufferSize = Buffer.size();
+  char *NameSpace = ItaniumPartialDemangler.getFunctionDeclContextName(
+  &Buffer[0], &BufferSize);
+  return NameSpace ? std::string(NameSpace) : std::string("");
+};
+
+// Preprocessing YamlBFs to minimize the number of BFs to process.
+std::unordered_map>

maksfb wrote:

Can you use `StringMap` here? 
https://llvm.org/docs/ProgrammersManual.html#llvm-adt-stringmap-h

https://github.com/llvm/llvm-project/pull/95884
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT] Name similarity function matching (PR #95884)

2024-06-27 Thread Maksim Panchenko via llvm-branch-commits


@@ -415,11 +423,116 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
 if (!YamlBF.Used && BF && !ProfiledFunctions.count(BF))
   matchProfileToFunction(YamlBF, *BF);
 
+  // Uses name similarity to match functions that were not matched by name.
+  uint64_t MatchedWithNameSimilarity = 0;
+
+  if (opts::NameSimilarityFunctionMatchingThreshold > 0) {
+ItaniumPartialDemangler ItaniumPartialDemangler;
+
+auto DemangleName = [&](std::string &FunctionName) {
+  StringRef RestoredName = NameResolver::restore(FunctionName);
+  return demangle(RestoredName);
+};
+
+auto DeriveNameSpace = [&](std::string &DemangledName) {
+  if (ItaniumPartialDemangler.partialDemangle(DemangledName.c_str()))
+return std::string("");
+  std::vector Buffer(DemangledName.begin(), DemangledName.end());
+  size_t BufferSize = Buffer.size();
+  char *NameSpace = ItaniumPartialDemangler.getFunctionDeclContextName(
+  &Buffer[0], &BufferSize);
+  return NameSpace ? std::string(NameSpace) : std::string("");
+};
+
+// Preprocessing YamlBFs to minimize the number of BFs to process.
+std::unordered_map>
+  NamespaceToProfiledBFSizes;
+NamespaceToProfiledBFSizes.reserve(YamlBP.Functions.size());
+std::vector ProfileBFDemangledNames;
+ProfileBFDemangledNames.reserve(YamlBP.Functions.size());
+std::vector ProfiledBFNamespaces;
+ProfiledBFNamespaces.reserve(YamlBP.Functions.size());
+
+for (auto &YamlBF : YamlBP.Functions) {
+  std::string YamlBFDemangledName = DemangleName(YamlBF.Name);
+  ProfileBFDemangledNames.push_back(YamlBFDemangledName);
+  std::string YamlBFNamespace = DeriveNameSpace(YamlBFDemangledName);
+  ProfiledBFNamespaces.push_back(YamlBFNamespace);
+  
NamespaceToProfiledBFSizes[YamlBFNamespace].insert(YamlBF.NumBasicBlocks);
+}
+
+std::unordered_map>

maksfb wrote:

Same for `StringMap`.

https://github.com/llvm/llvm-project/pull/95884
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Fangrui Song via llvm-branch-commits

MaskRay wrote:

https://llvm-compile-time-tracker.com/compare.php?from=abfff89b743584d2796000318198bf60d3622a1f&to=5c2a6b5ba62d2b7ed2c0ad3be29fba8558f5627b&stat=instructions:u

stage2-O3: `instructions:u` change (larger than expected):

|Benchmark|Old|New|
|--- |--- |--- |
|kimwitu++|38847M|38705M (-0.37%)|
|sqlite3|35002M|34917M (-0.24%)|
|consumer-typeset|31851M|31794M (-0.18%)|
|Bullet|93072M|92815M (-0.28%)|
|tramp3d-v4|78154M|77899M (-0.33%)|
|mafft|32841M|32718M (-0.37%)|
|ClamAV|50244M|50128M (-0.23%)|
|lencod|60998M|60793M (-0.34%)|
|SPASS|42834M|42725M (-0.25%)|
|7zip|191523M|191032M (-0.26%)|
|geomean|55146M|54990M (-0.28%)|


https://github.com/llvm/llvm-project/pull/96282
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT] Function matching with function calls as anchors (PR #96596)

2024-06-27 Thread shaw young via llvm-branch-commits

https://github.com/shawbyoung updated 
https://github.com/llvm/llvm-project/pull/96596

>From 05d59574d6260b98a469921eb2fccf5398bfafb6 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Mon, 24 Jun 2024 23:00:59 -0700
Subject: [PATCH 1/5] Added call to matchWithCallsAsAnchors

Created using spr 1.3.4
---
 bolt/lib/Profile/YAMLProfileReader.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp 
b/bolt/lib/Profile/YAMLProfileReader.cpp
index aafffac3d4b1c..1a0e5d239d252 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -479,6 +479,9 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
 if (!YamlBF.Used && BF && !ProfiledFunctions.count(BF))
   matchProfileToFunction(YamlBF, *BF);
 
+  uint64_t MatchedWithCallsAsAnchors = 0;
+  matchWithCallsAsAnchors(BC,  MatchedWithCallsAsAnchors);
+
   for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions)
 if (!YamlBF.Used && opts::Verbosity >= 1)
   errs() << "BOLT-WARNING: profile ignored for function " << YamlBF.Name

>From 77ef0008f4f5987719555e6cc3e32da812ae0f31 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Mon, 24 Jun 2024 23:11:43 -0700
Subject: [PATCH 2/5] Changed CallHashToBF representation

Created using spr 1.3.4
---
 bolt/lib/Profile/YAMLProfileReader.cpp | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp 
b/bolt/lib/Profile/YAMLProfileReader.cpp
index 1a0e5d239d252..91b01a99c7485 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -29,6 +29,10 @@ static llvm::cl::opt
cl::desc("ignore hash while reading function profile"),
cl::Hidden, cl::cat(BoltOptCategory));
 
+llvm::cl::opt MatchWithCallsAsAnchors("match-with-calls-as-anchors",
+  cl::desc("Matches with calls as anchors"),
+  cl::Hidden, cl::cat(BoltOptCategory));
+
 llvm::cl::opt ProfileUseDFS("profile-use-dfs",
   cl::desc("use DFS order for YAML profile"),
   cl::Hidden, cl::cat(BoltOptCategory));
@@ -353,7 +357,7 @@ void YAMLProfileReader::matchWithCallsAsAnchors(
 llvm_unreachable("Unhandled HashFunction");
   };
 
-  std::unordered_map CallHashToBF;
+  std::unordered_map CallHashToBF;
 
   for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
 if (ProfiledFunctions.count(BF))
@@ -375,12 +379,12 @@ void YAMLProfileReader::matchWithCallsAsAnchors(
   for (const std::string &FunctionName : FunctionNames)
 HashString.append(FunctionName);
 }
-CallHashToBF.emplace(ComputeCallHash(HashString), BF);
+CallHashToBF[ComputeCallHash(HashString)] = BF;
   }
 
   std::unordered_map ProfiledFunctionIdToName;
 
-  for (const yaml::bolt::BinaryFunctionProfile YamlBF : YamlBP.Functions)
+  for (const yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions)
 ProfiledFunctionIdToName[YamlBF.Id] = YamlBF.Name;
 
   for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) {
@@ -401,7 +405,7 @@ void YAMLProfileReader::matchWithCallsAsAnchors(
 auto It = CallHashToBF.find(Hash);
 if (It == CallHashToBF.end())
   continue;
-matchProfileToFunction(YamlBF, It->second);
+matchProfileToFunction(YamlBF, *It->second);
 ++MatchedWithCallsAsAnchors;
   }
 }
@@ -480,7 +484,8 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
   matchProfileToFunction(YamlBF, *BF);
 
   uint64_t MatchedWithCallsAsAnchors = 0;
-  matchWithCallsAsAnchors(BC,  MatchedWithCallsAsAnchors);
+  if (opts::MatchWithCallsAsAnchors)
+matchWithCallsAsAnchors(BC,  MatchedWithCallsAsAnchors);
 
   for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions)
 if (!YamlBF.Used && opts::Verbosity >= 1)

>From ea7cb68ab9e8e158412c2e752986968968a60d93 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Tue, 25 Jun 2024 09:28:39 -0700
Subject: [PATCH 3/5] Changed BF called FunctionNames to multiset

Created using spr 1.3.4
---
 bolt/lib/Profile/YAMLProfileReader.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp 
b/bolt/lib/Profile/YAMLProfileReader.cpp
index 91b01a99c7485..3b3d73f7af023 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -365,7 +365,7 @@ void YAMLProfileReader::matchWithCallsAsAnchors(
 
 std::string HashString;
 for (const auto &BB : BF->blocks()) {
-  std::set FunctionNames;
+  std::multiset FunctionNames;
   for (const MCInst &Instr : BB) {
 // Skip non-call instructions.
 if (!BC.MIB->isCall(Instr))
@@ -397,9 +397,8 @@ void YAMLProfileReader::matchWithCallsAsAnchors(
 std::string &FunctionName = ProfiledFunctionIdToName[CallSite.DestId];
 FunctionNames.insert(FunctionName);
   

[llvm-branch-commits] [llvm] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Eli Friedman via llvm-branch-commits

https://github.com/efriedma-quic edited 
https://github.com/llvm/llvm-project/pull/96282
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Eli Friedman via llvm-branch-commits


@@ -322,24 +306,20 @@ struct hash_state {
   }
 };
 
-
-/// A global, fixed seed-override variable.
-///
-/// This variable can be set using the \see llvm::set_fixed_execution_seed
-/// function. See that function for details. Do not, under any circumstances,
-/// set or read this variable.
-extern uint64_t fixed_seed_override;
-
+/// In LLVM_ENABLE_ABI_BREAKING_CHECKS builds, the seed is non-deterministic
+/// (address of a variable) to prevent having users depend on the particular
+/// hash values. On platforms without ASLR, this is still likely
+/// non-deterministic per build.
 inline uint64_t get_execution_seed() {
-  // FIXME: This needs to be a per-execution seed. This is just a placeholder
-  // implementation. Switching to a per-execution seed is likely to flush out
-  // instability bugs and so will happen as its own commit.
-  //
-  // However, if there is a fixed seed override set the first time this is
-  // called, return that instead of the per-execution seed.
-  const uint64_t seed_prime = 0xff51afd7ed558ccdULL;
-  static uint64_t seed = fixed_seed_override ? fixed_seed_override : 
seed_prime;
-  return seed;
+  // Work around x86-64 negative offset folding for old Clang -fno-pic
+  // https://reviews.llvm.org/D93931
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS && 
\
+(!defined(__clang__) || __clang_major__ > 11)

efriedma-quic wrote:

Is it an ABI problem that this ifdef exists?  I mean, LLVM libraries built with 
clang<=11 can't be used by programs built with clang>11.  With 
LLVM_ENABLE_ABI_BREAKING_CHECKS, I guess it's unlikely to cause issues, though. 
(I guess you could use an empty inline asm as a workaround if you wanted to.)

https://github.com/llvm/llvm-project/pull/96282
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Eli Friedman via llvm-branch-commits

https://github.com/efriedma-quic commented:

I think I'm happier restricting the non-determinism to +Asserts for now, at 
least as an incremental step.

> Due to Avalanche effects, even a few ASLR bits are sufficient to cover many 
> different scenarios and expose latent bugs.

On Windows specifically, I'm less concerned about the total number of bits, and 
more concerned that ASLR isn't randomized for each run of an executable.

https://github.com/llvm/llvm-project/pull/96282
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT][NFC] Move CallGraph from Passes to Core (PR #96922)

2024-06-27 Thread shaw young via llvm-branch-commits

https://github.com/shawbyoung updated 
https://github.com/llvm/llvm-project/pull/96922

>From 84a2f69e71372891e2721552b10e0105b9430257 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Thu, 27 Jun 2024 09:28:22 -0700
Subject: [PATCH 1/2] Updated file headers

Created using spr 1.3.4
---
 bolt/include/bolt/Core/BinaryFunctionCallGraph.h | 2 +-
 bolt/include/bolt/Core/CallGraph.h   | 2 +-
 bolt/include/bolt/Core/CallGraphWalker.h | 2 +-
 bolt/lib/Core/BinaryFunctionCallGraph.cpp| 2 +-
 bolt/lib/Core/CallGraph.cpp  | 2 +-
 bolt/lib/Core/CallGraphWalker.cpp| 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryFunctionCallGraph.h 
b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
index 4579c33985254..4ff5b1b94c5e5 100644
--- a/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
+++ b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.h --*- C++ 
-*-===//
+//===- bolt/Core/CallGraph.h --*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Core/CallGraph.h 
b/bolt/include/bolt/Core/CallGraph.h
index bdbc50bb78e87..2fc18e61afcaa 100644
--- a/bolt/include/bolt/Core/CallGraph.h
+++ b/bolt/include/bolt/Core/CallGraph.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.h --*- C++ 
-*-===//
+//===- bolt/Core/CallGraph.h --*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Core/CallGraphWalker.h 
b/bolt/include/bolt/Core/CallGraphWalker.h
index ac45644be362f..b0a73aee14369 100644
--- a/bolt/include/bolt/Core/CallGraphWalker.h
+++ b/bolt/include/bolt/Core/CallGraphWalker.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraphWalker.h *- C++ 
-*-===//
+//===- bolt/Core/CallGraphWalker.h *- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/BinaryFunctionCallGraph.cpp 
b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
index 86a31188c854a..b4b7897aa426a 100644
--- a/bolt/lib/Core/BinaryFunctionCallGraph.cpp
+++ b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/BinaryFunctionCallGraph.cpp 
===//
+//===- bolt/Core/BinaryFunctionCallGraph.cpp ===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp
index a7ea64fbbcf58..5f6bd11e9e97a 100644
--- a/bolt/lib/Core/CallGraph.cpp
+++ b/bolt/lib/Core/CallGraph.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.cpp 
--===//
+//===- bolt/Core/CallGraph.cpp --===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/CallGraphWalker.cpp 
b/bolt/lib/Core/CallGraphWalker.cpp
index 9d0087f79d17f..cbfa178d8e068 100644
--- a/bolt/lib/Core/CallGraphWalker.cpp
+++ b/bolt/lib/Core/CallGraphWalker.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraphWalker.cpp 
===//
+//===- bolt/Core/CallGraphWalker.cpp ===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

>From 1980f9bafd39ec84e71d71bd6a50d3368e1dbbe4 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Thu, 27 Jun 2024 11:27:43 -0700
Subject: [PATCH 2/2] clang-format

Created using spr 1.3.4
---
 bolt/lib/Core/CallGraph.cpp| 14 +++---
 bolt/lib/Passes/FrameAnalysis.cpp  |  2 +-
 bolt/lib/Passes/FrameOptimizer.cpp |  2 +-
 bolt/lib/Passes/RegReAssign.cpp|  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/bolt/lib/Core/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp
index 5f6bd11e9e97a..f1d52737bf556 100644
--- a/bolt/lib/Core/CallGraph.cpp
+++ b/bolt/lib/Core/CallGraph.cpp
@@ -15,11 +15,11 @@
 #define DEBUG_TYPE "callgraph"
 
 #if defined(__x86_64__) && !defined(_MSC_VER)
-#  if (!defined USE_SSECRC)
-#define USE_SSECRC
-#  endif
+#if (!defined USE_SSECRC)
+#define USE_SSECRC
+#endif
 #else
-#  undef USE_SSECRC
+#undef USE_SSECRC
 #endif
 
 static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64_fallback(int64_t k) {
@@ -50,7 +50,7 @@ static inline size_t hash_int6

[llvm-branch-commits] [llvm] [BOLT][NFC] Move CallGraph from Passes to Core (PR #96922)

2024-06-27 Thread shaw young via llvm-branch-commits

https://github.com/shawbyoung updated 
https://github.com/llvm/llvm-project/pull/96922

>From 84a2f69e71372891e2721552b10e0105b9430257 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Thu, 27 Jun 2024 09:28:22 -0700
Subject: [PATCH 1/2] Updated file headers

Created using spr 1.3.4
---
 bolt/include/bolt/Core/BinaryFunctionCallGraph.h | 2 +-
 bolt/include/bolt/Core/CallGraph.h   | 2 +-
 bolt/include/bolt/Core/CallGraphWalker.h | 2 +-
 bolt/lib/Core/BinaryFunctionCallGraph.cpp| 2 +-
 bolt/lib/Core/CallGraph.cpp  | 2 +-
 bolt/lib/Core/CallGraphWalker.cpp| 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryFunctionCallGraph.h 
b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
index 4579c33985254..4ff5b1b94c5e5 100644
--- a/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
+++ b/bolt/include/bolt/Core/BinaryFunctionCallGraph.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.h --*- C++ 
-*-===//
+//===- bolt/Core/CallGraph.h --*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Core/CallGraph.h 
b/bolt/include/bolt/Core/CallGraph.h
index bdbc50bb78e87..2fc18e61afcaa 100644
--- a/bolt/include/bolt/Core/CallGraph.h
+++ b/bolt/include/bolt/Core/CallGraph.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.h --*- C++ 
-*-===//
+//===- bolt/Core/CallGraph.h --*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Core/CallGraphWalker.h 
b/bolt/include/bolt/Core/CallGraphWalker.h
index ac45644be362f..b0a73aee14369 100644
--- a/bolt/include/bolt/Core/CallGraphWalker.h
+++ b/bolt/include/bolt/Core/CallGraphWalker.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraphWalker.h *- C++ 
-*-===//
+//===- bolt/Core/CallGraphWalker.h *- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/BinaryFunctionCallGraph.cpp 
b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
index 86a31188c854a..b4b7897aa426a 100644
--- a/bolt/lib/Core/BinaryFunctionCallGraph.cpp
+++ b/bolt/lib/Core/BinaryFunctionCallGraph.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/BinaryFunctionCallGraph.cpp 
===//
+//===- bolt/Core/BinaryFunctionCallGraph.cpp ===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp
index a7ea64fbbcf58..5f6bd11e9e97a 100644
--- a/bolt/lib/Core/CallGraph.cpp
+++ b/bolt/lib/Core/CallGraph.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraph.cpp 
--===//
+//===- bolt/Core/CallGraph.cpp --===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/lib/Core/CallGraphWalker.cpp 
b/bolt/lib/Core/CallGraphWalker.cpp
index 9d0087f79d17f..cbfa178d8e068 100644
--- a/bolt/lib/Core/CallGraphWalker.cpp
+++ b/bolt/lib/Core/CallGraphWalker.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/CallGraphWalker.cpp 
===//
+//===- bolt/Core/CallGraphWalker.cpp ===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

>From 1980f9bafd39ec84e71d71bd6a50d3368e1dbbe4 Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Thu, 27 Jun 2024 11:27:43 -0700
Subject: [PATCH 2/2] clang-format

Created using spr 1.3.4
---
 bolt/lib/Core/CallGraph.cpp| 14 +++---
 bolt/lib/Passes/FrameAnalysis.cpp  |  2 +-
 bolt/lib/Passes/FrameOptimizer.cpp |  2 +-
 bolt/lib/Passes/RegReAssign.cpp|  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/bolt/lib/Core/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp
index 5f6bd11e9e97a..f1d52737bf556 100644
--- a/bolt/lib/Core/CallGraph.cpp
+++ b/bolt/lib/Core/CallGraph.cpp
@@ -15,11 +15,11 @@
 #define DEBUG_TYPE "callgraph"
 
 #if defined(__x86_64__) && !defined(_MSC_VER)
-#  if (!defined USE_SSECRC)
-#define USE_SSECRC
-#  endif
+#if (!defined USE_SSECRC)
+#define USE_SSECRC
+#endif
 #else
-#  undef USE_SSECRC
+#undef USE_SSECRC
 #endif
 
 static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64_fallback(int64_t k) {
@@ -50,7 +50,7 @@ static inline size_t hash_int6

[llvm-branch-commits] [llvm] [Hashing] Use a non-deterministic seed if LLVM_ENABLE_ABI_BREAKING_CHECKS (PR #96282)

2024-06-27 Thread Fangrui Song via llvm-branch-commits


@@ -322,24 +306,20 @@ struct hash_state {
   }
 };
 
-
-/// A global, fixed seed-override variable.
-///
-/// This variable can be set using the \see llvm::set_fixed_execution_seed
-/// function. See that function for details. Do not, under any circumstances,
-/// set or read this variable.
-extern uint64_t fixed_seed_override;
-
+/// In LLVM_ENABLE_ABI_BREAKING_CHECKS builds, the seed is non-deterministic
+/// (address of a variable) to prevent having users depend on the particular
+/// hash values. On platforms without ASLR, this is still likely
+/// non-deterministic per build.
 inline uint64_t get_execution_seed() {
-  // FIXME: This needs to be a per-execution seed. This is just a placeholder
-  // implementation. Switching to a per-execution seed is likely to flush out
-  // instability bugs and so will happen as its own commit.
-  //
-  // However, if there is a fixed seed override set the first time this is
-  // called, return that instead of the per-execution seed.
-  const uint64_t seed_prime = 0xff51afd7ed558ccdULL;
-  static uint64_t seed = fixed_seed_override ? fixed_seed_override : 
seed_prime;
-  return seed;
+  // Work around x86-64 negative offset folding for old Clang -fno-pic
+  // https://reviews.llvm.org/D93931
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS && 
\
+(!defined(__clang__) || __clang_major__ > 11)

MaskRay wrote:

The clang condition introduces a slight ABI problem when mixing `clang<=11` and 
`clang>11` (the guard is `__clang_major__ > 11`). In practice it is rare that 
the llvm-project build and a downstream client exchange the hash values.

https://github.com/llvm/llvm-project/pull/96282
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] 62d7d56 - Revert "[Clang] Fix __is_trivially_equality_comparable returning true with in…"

2024-06-27 Thread via llvm-branch-commits

Author: Zequan Wu
Date: 2024-06-28T00:36:19-04:00
New Revision: 62d7d5611e70682f8743e7322e34204480ffe189

URL: 
https://github.com/llvm/llvm-project/commit/62d7d5611e70682f8743e7322e34204480ffe189
DIFF: 
https://github.com/llvm/llvm-project/commit/62d7d5611e70682f8743e7322e34204480ffe189.diff

LOG: Revert "[Clang] Fix __is_trivially_equality_comparable returning true with 
in…"

This reverts commit 5b363483cf2461617fbb2449491c9914811c8d53.

Added: 


Modified: 
clang/docs/ReleaseNotes.rst
clang/include/clang/AST/Type.h
clang/lib/AST/Type.cpp
clang/lib/Sema/SemaExprCXX.cpp
clang/test/SemaCXX/type-traits.cpp

Removed: 




diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7ebfc87144269..da967fcdda808 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -104,7 +104,7 @@ ABI Changes in This Version
   ifuncs. Its purpose was to preserve backwards compatibility when the ".ifunc"
   suffix got removed from the name mangling. The alias interacts badly with
   GlobalOpt (see the issue #96197).
-
+  
 - Fixed Microsoft name mangling for auto non-type template arguments of pointer
   type for MSVC 1920+. This change resolves incompatibilities with code 
compiled
   by MSVC 1920+ but will introduce incompatibilities with code compiled by
@@ -740,9 +740,6 @@ Bug Fixes in This Version
   negatives where the analysis failed to detect unchecked access to guarded
   data.
 
-- ``__is_trivially_equality_comparable`` no longer returns true for types which
-  have a constrained defaulted comparison operator (#GH89293).
-
 Bug Fixes to Compiler Builtins
 ^^
 

diff  --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index a98899f7f4222..62836ec5c6312 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -1142,6 +1142,9 @@ class QualType {
   /// Return true if this is a trivially relocatable type.
   bool isTriviallyRelocatableType(const ASTContext &Context) const;
 
+  /// Return true if this is a trivially equality comparable type.
+  bool isTriviallyEqualityComparableType(const ASTContext &Context) const;
+
   /// Returns true if it is a class and it might be dynamic.
   bool mayBeDynamicClass() const;
 

diff  --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index cc535aba4936e..d8b885870de3a 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2815,6 +2815,66 @@ bool QualType::isTriviallyRelocatableType(const 
ASTContext &Context) const {
   }
 }
 
+static bool
+HasNonDeletedDefaultedEqualityComparison(const CXXRecordDecl *Decl) {
+  if (Decl->isUnion())
+return false;
+  if (Decl->isLambda())
+return Decl->isCapturelessLambda();
+
+  auto IsDefaultedOperatorEqualEqual = [&](const FunctionDecl *Function) {
+return Function->getOverloadedOperator() ==
+   OverloadedOperatorKind::OO_EqualEqual &&
+   Function->isDefaulted() && Function->getNumParams() > 0 &&
+   (Function->getParamDecl(0)->getType()->isReferenceType() ||
+Decl->isTriviallyCopyable());
+  };
+
+  if (llvm::none_of(Decl->methods(), IsDefaultedOperatorEqualEqual) &&
+  llvm::none_of(Decl->friends(), [&](const FriendDecl *Friend) {
+if (NamedDecl *ND = Friend->getFriendDecl()) {
+  return ND->isFunctionOrFunctionTemplate() &&
+ IsDefaultedOperatorEqualEqual(ND->getAsFunction());
+}
+return false;
+  }))
+return false;
+
+  return llvm::all_of(Decl->bases(),
+  [](const CXXBaseSpecifier &BS) {
+if (const auto *RD = 
BS.getType()->getAsCXXRecordDecl())
+  return HasNonDeletedDefaultedEqualityComparison(RD);
+return true;
+  }) &&
+ llvm::all_of(Decl->fields(), [](const FieldDecl *FD) {
+   auto Type = FD->getType();
+   if (Type->isArrayType())
+ Type = 
Type->getBaseElementTypeUnsafe()->getCanonicalTypeUnqualified();
+
+   if (Type->isReferenceType() || Type->isEnumeralType())
+ return false;
+   if (const auto *RD = Type->getAsCXXRecordDecl())
+ return HasNonDeletedDefaultedEqualityComparison(RD);
+   return true;
+ });
+}
+
+bool QualType::isTriviallyEqualityComparableType(
+const ASTContext &Context) const {
+  QualType CanonicalType = getCanonicalType();
+  if (CanonicalType->isIncompleteType() || CanonicalType->isDependentType() ||
+  CanonicalType->isEnumeralType() || CanonicalType->isArrayType())
+return false;
+
+  if (const auto *RD = CanonicalType->getAsCXXRecordDecl()) {
+if (!HasNonDeletedDefaultedEqualityComparison(RD))
+  return false;
+  }
+
+  return Context.hasUniqueObjectRepresentations(
+  CanonicalType, /*CheckIfTriviallyCopyable=*/false);
+}
+
 boo

[llvm-branch-commits] [libcxx] a2fb2a1 - Revert "[libc++][NFC] Simplify pair a bit (#96165)"

2024-06-27 Thread via llvm-branch-commits

Author: James Y Knight
Date: 2024-06-28T01:02:40-04:00
New Revision: a2fb2a16f3d93364b8eaae82db443eb354299158

URL: 
https://github.com/llvm/llvm-project/commit/a2fb2a16f3d93364b8eaae82db443eb354299158
DIFF: 
https://github.com/llvm/llvm-project/commit/a2fb2a16f3d93364b8eaae82db443eb354299158.diff

LOG: Revert "[libc++][NFC] Simplify pair a bit (#96165)"

This reverts commit 54cb5ca9f48fc542b920662a0eee7c0e6f35bee0.

Added: 


Modified: 
libcxx/include/__utility/pair.h

Removed: 




diff  --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h
index 1b0d9241886f9..0afbebcdc9f2a 100644
--- a/libcxx/include/__utility/pair.h
+++ b/libcxx/include/__utility/pair.h
@@ -16,6 +16,8 @@
 #include <__fwd/array.h>
 #include <__fwd/pair.h>
 #include <__fwd/tuple.h>
+#include <__tuple/sfinae_helpers.h>
+#include <__tuple/tuple_element.h>
 #include <__tuple/tuple_indices.h>
 #include <__tuple/tuple_like_no_subrange.h>
 #include <__tuple/tuple_size.h>
@@ -128,15 +130,19 @@ struct _LIBCPP_TEMPLATE_VIS pair
 }
   };
 
-  template  = 0>
-  explicit(!_CheckArgs::__enable_implicit_default()) _LIBCPP_HIDE_FROM_ABI 
constexpr pair() noexcept(
+  template 
+  using _CheckArgsDep _LIBCPP_NODEBUG =
+  typename conditional< _MaybeEnable, _CheckArgs, 
__check_tuple_constructor_fail>::type;
+
+  template ::__enable_default(), int> = 0>
+  explicit(!_CheckArgsDep<_Dummy>::__enable_implicit_default()) 
_LIBCPP_HIDE_FROM_ABI constexpr pair() noexcept(
   is_nothrow_default_constructible::value && 
is_nothrow_default_constructible::value)
   : first(), second() {}
 
-  template (), int> = 0>
+  template ::template 
__is_pair_constructible<_T1 const&, _T2 const&>(), int> = 0>
   _LIBCPP_HIDE_FROM_ABI
-  _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!_CheckArgs::template 
__is_implicit<_T1 const&, _T2 const&>())
+  _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!_CheckArgsDep<_Dummy>::template 
__is_implicit<_T1 const&, _T2 const&>())
   pair(_T1 const& __t1, _T2 const& __t2) 
noexcept(is_nothrow_copy_constructible::value &&
   
is_nothrow_copy_constructible::value)
   : first(__t1), second(__t2) {}



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits