[llvm-branch-commits] [clang] 97a9ef1 - Revert "[Clang][C++26] Implement "Ordering of constraints involving fold expr…"

2024-07-16 Thread via llvm-branch-commits

Author: cor3ntin
Date: 2024-07-16T10:59:38+02:00
New Revision: 97a9ef1c1810b88102d097d334b2017cce782390

URL: 
https://github.com/llvm/llvm-project/commit/97a9ef1c1810b88102d097d334b2017cce782390
DIFF: 
https://github.com/llvm/llvm-project/commit/97a9ef1c1810b88102d097d334b2017cce782390.diff

LOG: Revert "[Clang][C++26] Implement "Ordering of constraints involving fold 
expr…"

This reverts commit 244892735941a455506ae38ae0fb40cf80cdb351.

Added: 


Modified: 
clang/docs/ReleaseNotes.rst
clang/include/clang/Sema/Sema.h
clang/include/clang/Sema/SemaConcept.h
clang/lib/Sema/SemaConcept.cpp
clang/lib/Sema/SemaTemplateVariadic.cpp
clang/www/cxx_status.html

Removed: 
clang/test/SemaCXX/cxx2c-fold-exprs.cpp



diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index cb35825b71e3e..969856a8f978c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -278,9 +278,6 @@ C++2c Feature Support
 
 - Implemented `P3144R2 Deleting a Pointer to an Incomplete Type Should be
   Ill-formed <https://wg21.link/P3144R2>`_.
 
-- Implemented `P2963R3 Ordering of constraints involving fold expressions
-  <https://wg21.link/P2963R3>`_.
-
-
 Resolutions to C++ Defect Reports
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 - Substitute template parameter pack, when it is not explicitly specified

diff  --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 3cb1aa935fe46..48dff1b76cc57 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -14078,11 +14078,6 @@ class Sema final : public SemaBase {
   const DeclarationNameInfo &NameInfo,
      SmallVectorImpl<UnexpandedParameterPack> &Unexpanded);
 
-  /// Collect the set of unexpanded parameter packs within the given
-  /// expression.
-  static void collectUnexpandedParameterPacks(
-      Expr *E, SmallVectorImpl<UnexpandedParameterPack> &Unexpanded);
-
   /// Invoked when parsing a template argument followed by an
   /// ellipsis, which creates a pack expansion.
   ///

diff  --git a/clang/include/clang/Sema/SemaConcept.h 
b/clang/include/clang/Sema/SemaConcept.h
index 8fb7dd6838e57..711443505174f 100644
--- a/clang/include/clang/Sema/SemaConcept.h
+++ b/clang/include/clang/Sema/SemaConcept.h
@@ -75,26 +75,6 @@ struct AtomicConstraint {
   }
 };
 
-struct FoldExpandedConstraint;
-
-using NormalFormConstraint =
-    llvm::PointerUnion<AtomicConstraint *, FoldExpandedConstraint *>;
-struct NormalizedConstraint;
-using NormalForm =
-    llvm::SmallVector<llvm::SmallVector<NormalFormConstraint, 2>, 4>;
-
-// A constraint is in conjunctive normal form when it is a conjunction of
-// clauses where each clause is a disjunction of atomic constraints. For atomic
-// constraints A, B, and C, the constraint A  ∧ (B  ∨ C) is in conjunctive
-// normal form.
-NormalForm makeCNF(const NormalizedConstraint &Normalized);
-
-// A constraint is in disjunctive normal form when it is a disjunction of
-// clauses where each clause is a conjunction of atomic constraints. For atomic
-// constraints A, B, and C, the disjunctive normal form of the constraint A
-//  ∧ (B  ∨ C) is (A  ∧ B)  ∨ (A  ∧ C).
-NormalForm makeDNF(const NormalizedConstraint &Normalized);
-
 /// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is
 /// either an atomic constraint, a conjunction of normalized constraints or a
 /// disjunction of normalized constraints.
@@ -107,17 +87,26 @@ struct NormalizedConstraint {
      std::pair<NormalizedConstraint, NormalizedConstraint> *, 1,
   CompoundConstraintKind>;
 
-  llvm::PointerUnion<AtomicConstraint *, FoldExpandedConstraint *,
-                     CompoundConstraint>
-      Constraint;
+  llvm::PointerUnion<AtomicConstraint *, CompoundConstraint> Constraint;
 
   NormalizedConstraint(AtomicConstraint *C): Constraint{C} { };
-  NormalizedConstraint(FoldExpandedConstraint *C) : Constraint{C} {};
-
   NormalizedConstraint(ASTContext &C, NormalizedConstraint LHS,
-   NormalizedConstraint RHS, CompoundConstraintKind Kind);
-
-  NormalizedConstraint(ASTContext &C, const NormalizedConstraint &Other);
+   NormalizedConstraint RHS, CompoundConstraintKind Kind)
+  : Constraint{CompoundConstraint{
+            new (C) std::pair<NormalizedConstraint, NormalizedConstraint>{
+std::move(LHS), std::move(RHS)}, Kind}} { };
+
+  NormalizedConstraint(ASTContext &C, const NormalizedConstraint &Other) {
+if (Other.isAtomic()) {
+  Constraint = new (C) AtomicConstraint(*Other.getAtomicConstraint());
+} else {
+  Constraint = CompoundConstraint(
+          new (C) std::pair<NormalizedConstraint, NormalizedConstraint>{
+  NormalizedConstraint(C, Other.getLHS()),
+  NormalizedConstraint(C, Other.getRHS())},
+  Other.getCompoundKind());
+}
+  }
   NormalizedConstraint(NormalizedConstraint &&Other):
   Constraint(Other.Constraint) {
 Other.Constraint = nullptr;
@@ -131,24 +120,20 @@ struct NormalizedConstraint {
 return *this;
   }
 
-  bool isAtomic() const { return Constraint.is<AtomicConstraint *>(); }
-  bool isFoldExpanded() const {
-    return Constraint.is<FoldExpandedConstraint *>();
-  }
-  bool isCompound() const { return Constraint.is<CompoundConstraint>(); }
-
   CompoundConstraintKind getCompoundKind() 
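
A worked example of the normal-form transforms described in the removed
makeCNF/makeDNF comments. This is a minimal standalone sketch, not the Sema
implementation; the Clause/DNF types and the conjoinDNF name are illustrative
only.

#include <string>
#include <vector>

using Clause = std::vector<std::string>; // conjunction of atomic constraints
using DNF = std::vector<Clause>;         // disjunction of clauses

// DNF of (L and R): distribute each clause of L over each clause of R, so
// A and (B or C) becomes (A and B) or (A and C).
DNF conjoinDNF(const DNF &L, const DNF &R) {
  DNF Out;
  for (const Clause &X : L)
    for (const Clause &Y : R) {
      Clause Joined = X;
      Joined.insert(Joined.end(), Y.begin(), Y.end());
      Out.push_back(Joined);
    }
  return Out;
}

int main() {
  DNF A = {{"A"}};             // atomic constraint A
  DNF BorC = {{"B"}, {"C"}};   // B or C
  DNF R = conjoinDNF(A, BorC); // {{A,B},{A,C}}, i.e. (A and B) or (A and C)
  return R.size() == 2 ? 0 : 1;
}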

[llvm-branch-commits] [llvm] [BOLT] Match functions with call graph (PR #98125)

2024-07-16 Thread Shaw Young via llvm-branch-commits

https://github.com/shawbyoung updated 
https://github.com/llvm/llvm-project/pull/98125

>From cf32a43e7c2b04079c6123fe13df4fb7226d771f Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Tue, 9 Jul 2024 10:04:25 -0700
Subject: [PATCH 01/11] Comments

Created using spr 1.3.4
---
 bolt/lib/Profile/YAMLProfileReader.cpp | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp 
b/bolt/lib/Profile/YAMLProfileReader.cpp
index 69ea0899c5f2c..6753337c24ea7 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -501,7 +501,6 @@ size_t YAMLProfileReader::matchWithCallGraph(BinaryContext 
&BC) {
 
   // Maps binary functions to adjacent functions in the FCG.
   for (const BinaryFunction *CallerBF : BFs) {
-// Add all call targets to the hash map.
 for (const BinaryBasicBlock &BB : CallerBF->blocks()) {
   for (const MCInst &Inst : BB) {
 if (!BC.MIB->isCall(Instr))
@@ -533,7 +532,8 @@ size_t YAMLProfileReader::matchWithCallGraph(BinaryContext 
&BC) {
 }
   }
 
-  // Create mapping from neighbor hash to BFs.
+  // Using the constructed adjacent function mapping, creates mapping from
+  // neighbor hash to BFs.
  std::unordered_map<uint64_t, std::vector<const BinaryFunction *>>
   NeighborHashToBFs;
   for (const BinaryFunction *BF : BFs) {
@@ -552,12 +552,12 @@ size_t 
YAMLProfileReader::matchWithCallGraph(BinaryContext &BC) {
 .push_back(BF);
   }
 
-  // TODO: change call anchor PR to have this representation - we need it here
+  // TODO: note, this will be introduced in the matching functions with calls
+  // as anchors pr
   DenseMap
   IdToYAMLBF;
-  // TODO: change call anchor PR to have this representation - we need it here
 
-  // Maps hashes to profiled functions.
+  // Maps YAML functions to adjacent functions in the profile FCG.
   std::unordered_map
   YamlBFToHashes(BFs.size());
@@ -590,7 +590,7 @@ size_t YAMLProfileReader::matchWithCallGraph(BinaryContext 
&BC) {
 }
   }
 
-  // Matching YAMLBF with neighbor hashes.
+  // Matches YAMLBF to BFs with neighbor hashes.
   for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) {
 if (YamlBF.Used)
   continue;

>From ee9049fc4bd3d4203c19c9c0982a78ab3b47666f Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Tue, 9 Jul 2024 13:52:05 -0700
Subject: [PATCH 02/11] Moved blended hash definition

Created using spr 1.3.4
---
 bolt/include/bolt/Profile/YAMLProfileReader.h |  69 ++-
 bolt/lib/Profile/StaleProfileMatching.cpp |  65 ---
 bolt/lib/Profile/YAMLProfileReader.cpp| 110 --
 3 files changed, 119 insertions(+), 125 deletions(-)

diff --git a/bolt/include/bolt/Profile/YAMLProfileReader.h 
b/bolt/include/bolt/Profile/YAMLProfileReader.h
index 36e8f8739eee1..e8a34ecad9a08 100644
--- a/bolt/include/bolt/Profile/YAMLProfileReader.h
+++ b/bolt/include/bolt/Profile/YAMLProfileReader.h
@@ -16,6 +16,73 @@
 namespace llvm {
 namespace bolt {
 
+/// An object wrapping several components of a basic block hash. The combined
+/// (blended) hash is represented and stored as one uint64_t, while individual
+/// components are of smaller size (e.g., uint16_t or uint8_t).
+struct BlendedBlockHash {
+private:
+  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
+  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
+  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
+  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
+  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;
+
+public:
+  explicit BlendedBlockHash() {}
+
+  explicit BlendedBlockHash(uint64_t Hash) {
+    Offset = Bitfield::get<ValueOffset>(Hash);
+    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
+    InstrHash = Bitfield::get<ValueInstr>(Hash);
+    PredHash = Bitfield::get<ValuePred>(Hash);
+    SuccHash = Bitfield::get<ValueSucc>(Hash);
+  }
+
+  /// Combine the blended hash into uint64_t.
+  uint64_t combine() const {
+uint64_t Hash = 0;
+    Bitfield::set<ValueOffset>(Hash, Offset);
+    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
+    Bitfield::set<ValueInstr>(Hash, InstrHash);
+    Bitfield::set<ValuePred>(Hash, PredHash);
+    Bitfield::set<ValueSucc>(Hash, SuccHash);
+return Hash;
+  }
+
+  /// Compute a distance between two given blended hashes. The smaller the
+  /// distance, the more similar two blocks are. For identical basic blocks,
+  /// the distance is zero.
+  uint64_t distance(const BlendedBlockHash &BBH) const {
+assert(OpcodeHash == BBH.OpcodeHash &&
+   "incorrect blended hash distance computation");
+uint64_t Dist = 0;
+// Account for NeighborHash
+Dist += SuccHash == BBH.SuccHash ? 0 : 1;
+Dist += PredHash == BBH.PredHash ? 0 : 1;
+Dist <<= 16;
+// Account for InstrHash
+Dist += InstrHash == BBH.InstrHash ? 0 : 1;
+Dist <<= 16;
+// Account for Offset
+Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
+return Dist;
+  }
+
+  /// The offset of the basic block from the function start.
+  uint16_t Offset{0};
+  /// (Loose) Hash of the basic block instructions, excluding operands.
+  uint16_t OpcodeHash{0};
+  /// (Str
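
The distance() above packs several mismatch signals into one integer so that
plain integer comparison orders candidates lexicographically. A standalone
sketch of that trick, assuming the 16-bit field widths reconstructed above;
names are illustrative, not BOLT's, and the predecessor hash (handled the same
way as the successor hash in the real code) is omitted for brevity.

#include <cstdint>

uint64_t lexDistance(uint16_t SuccA, uint16_t SuccB, uint16_t InstrA,
                     uint16_t InstrB, uint16_t OffA, uint16_t OffB) {
  uint64_t Dist = 0;
  Dist += (SuccA == SuccB) ? 0 : 1; // most significant: neighbor mismatch
  Dist <<= 16;
  Dist += (InstrA == InstrB) ? 0 : 1; // then instruction-hash mismatch
  Dist <<= 16;
  Dist += (OffA >= OffB) ? OffA - OffB : OffB - OffA; // offset tie-breaker
  return Dist;
}

int main() {
  // A successor-hash mismatch outweighs any offset difference.
  return lexDistance(1, 2, 7, 7, 0, 0) > lexDistance(5, 5, 7, 7, 0xFFFF, 0)
             ? 0
             : 1;
}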

[llvm-branch-commits] [llvm] ddc8e9d - Revert "[PatternMatch] Fix issue of stale reference in new `m_{I,F,}Cmp` matc…"

2024-07-16 Thread via llvm-branch-commits

Author: Vitaly Buka
Date: 2024-07-16T09:53:13-07:00
New Revision: ddc8e9d34306177966a3f46e8eefc3d4a0c7c8ef

URL: 
https://github.com/llvm/llvm-project/commit/ddc8e9d34306177966a3f46e8eefc3d4a0c7c8ef
DIFF: 
https://github.com/llvm/llvm-project/commit/ddc8e9d34306177966a3f46e8eefc3d4a0c7c8ef.diff

LOG: Revert "[PatternMatch] Fix issue of stale reference in new `m_{I,F,}Cmp` 
matc…"

This reverts commit e027017337cc8ae6ed03dc2a3d1c9903ea2f33b2.

Added: 


Modified: 
llvm/include/llvm/IR/PatternMatch.h
llvm/unittests/IR/PatternMatch.cpp

Removed: 




diff  --git a/llvm/include/llvm/IR/PatternMatch.h 
b/llvm/include/llvm/IR/PatternMatch.h
index bea1ad97ea09c..8ae47fb556b25 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1550,27 +1550,23 @@ template <typename T> inline Exact_match<T> 
m_Exact(const T &SubPattern) {
 template <typename LHS_t, typename RHS_t, typename Class, typename PredicateTy,
           bool Commutable = false>
 struct CmpClass_match {
-  PredicateTy *Predicate;
+  PredicateTy &Predicate;
   LHS_t L;
   RHS_t R;
 
   // The evaluation order is always stable, regardless of Commutability.
   // The LHS is always matched first.
   CmpClass_match(PredicateTy &Pred, const LHS_t &LHS, const RHS_t &RHS)
-  : Predicate(&Pred), L(LHS), R(RHS) {}
-  CmpClass_match(const LHS_t &LHS, const RHS_t &RHS)
-  : Predicate(nullptr), L(LHS), R(RHS) {}
+  : Predicate(Pred), L(LHS), R(RHS) {}
 
   template <typename OpTy> bool match(OpTy *V) {
     if (auto *I = dyn_cast<Class>(V)) {
   if (L.match(I->getOperand(0)) && R.match(I->getOperand(1))) {
-if (Predicate)
-  *Predicate = I->getPredicate();
+Predicate = I->getPredicate();
 return true;
   } else if (Commutable && L.match(I->getOperand(1)) &&
  R.match(I->getOperand(0))) {
-if (Predicate)
-  *Predicate = I->getSwappedPredicate();
+Predicate = I->getSwappedPredicate();
 return true;
   }
 }
@@ -1599,19 +1595,22 @@ m_FCmp(FCmpInst::Predicate &Pred, const LHS &L, const 
RHS &R) {
 template <typename LHS, typename RHS>
 inline CmpClass_match<LHS, RHS, CmpInst, CmpInst::Predicate>
 m_Cmp(const LHS &L, const RHS &R) {
-  return CmpClass_match<LHS, RHS, CmpInst, CmpInst::Predicate>(L, R);
+  CmpInst::Predicate Unused;
+  return CmpClass_match<LHS, RHS, CmpInst, CmpInst::Predicate>(Unused, L, R);
 }
 
 template <typename LHS, typename RHS>
 inline CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>
 m_ICmp(const LHS &L, const RHS &R) {
-  return CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>(L, R);
+  ICmpInst::Predicate Unused;
+  return CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>(Unused, L, R);
 }
 
 template <typename LHS, typename RHS>
 inline CmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>
 m_FCmp(const LHS &L, const RHS &R) {
-  return CmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>(L, R);
+  FCmpInst::Predicate Unused;
+  return CmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>(Unused, L, R);
 }
 
 // Same as CmpClass, but instead of saving Pred as out output variable, match a

diff  --git a/llvm/unittests/IR/PatternMatch.cpp 
b/llvm/unittests/IR/PatternMatch.cpp
index 309fcc93996bc..b82711ec244a6 100644
--- a/llvm/unittests/IR/PatternMatch.cpp
+++ b/llvm/unittests/IR/PatternMatch.cpp
@@ -2235,7 +2235,7 @@ typedef ::testing::Types<std::tuple<Value *, Instruction *>,
 std::tuple<const Value *, const Instruction *>>
 MutableConstTestTypes;
 TYPED_TEST_SUITE(MutableConstTest, MutableConstTestTypes, );
 
-TYPED_TEST(MutableConstTest, ICmp) {
+TYPED_TEST(MutableConstTest, /* FIXME: UAR bug */ DISABLED_ICmp) {
   auto &IRB = PatternMatchTest::IRB;
 
   typedef std::tuple_element_t<0, TypeParam> ValueType;
@@ -2319,7 +2319,7 @@ TYPED_TEST(MutableConstTest, ICmp) {
.match((InstructionType)IRB.CreateICmp(Pred, L, R)));
 }
 
-TYPED_TEST(MutableConstTest, FCmp) {
+TYPED_TEST(MutableConstTest, /* FIXME: UAR bug */ DISABLED_FCmp) {
   auto &IRB = PatternMatchTest::IRB;
 
   typedef std::tuple_element_t<0, TypeParam> ValueType;



_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
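
For context, the restored matchers store the predicate by reference and write
it through on a successful match. A minimal usage sketch of that API (the
caller-supplied lvalue outliving the matcher is exactly what the reverted
temporary-predicate variant could not guarantee):

#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

static bool isEqCompare(Value *V) {
  ICmpInst::Predicate Pred;
  Value *L, *R;
  // Pred is written through the stored reference when the match succeeds.
  if (match(V, m_ICmp(Pred, m_Value(L), m_Value(R))))
    return Pred == ICmpInst::ICMP_EQ;
  return false;
}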


[llvm-branch-commits] [clang] 7f876ac - Revert "Finish deleting the le32/le64 targets (#98497)"

2024-07-16 Thread via llvm-branch-commits

Author: Aaron Ballman
Date: 2024-07-16T14:46:21-04:00
New Revision: 7f876acbe41766b20f8fd9df77a00bdb4a4cc30c

URL: 
https://github.com/llvm/llvm-project/commit/7f876acbe41766b20f8fd9df77a00bdb4a4cc30c
DIFF: 
https://github.com/llvm/llvm-project/commit/7f876acbe41766b20f8fd9df77a00bdb4a4cc30c.diff

LOG: Revert "Finish deleting the le32/le64 targets (#98497)"

This reverts commit 2369a54fbeb61f965a3a425e660c878ae8b962c3.

Added: 
clang/lib/Basic/Targets/Le64.cpp
clang/lib/Basic/Targets/Le64.h

Modified: 
clang/docs/ReleaseNotes.rst
clang/docs/tools/clang-formatted-files.txt
clang/lib/Basic/CMakeLists.txt
clang/lib/Basic/Targets.cpp
clang/lib/Basic/Targets/OSTargets.h
clang/lib/CodeGen/CodeGenModule.cpp
clang/lib/CodeGen/ItaniumCXXABI.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/test/CodeGen/bitfield-access-pad.c
clang/test/CodeGen/bitfield-access-unit.c
clang/test/CodeGenCXX/bitfield-access-empty.cpp
clang/test/CodeGenCXX/bitfield-access-tail.cpp
clang/test/Preprocessor/predefined-macros-no-warnings.c
llvm/include/llvm/TargetParser/Triple.h
llvm/lib/TargetParser/Triple.cpp
llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn

Removed: 




diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e51dc8d76ac0d..d0138d6b00017 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -40,8 +40,6 @@ code bases.
 - Setting the deprecated CMake variable ``GCC_INSTALL_PREFIX`` (which sets the
   default ``--gcc-toolchain=``) now leads to a fatal error.
 
-- The ``le32`` and ``le64`` targets have been removed.
-
 C/C++ Language Potentially Breaking Changes
 ---
 

diff  --git a/clang/docs/tools/clang-formatted-files.txt 
b/clang/docs/tools/clang-formatted-files.txt
index 62871133a6807..a8ee8f1fcb87c 100644
--- a/clang/docs/tools/clang-formatted-files.txt
+++ b/clang/docs/tools/clang-formatted-files.txt
@@ -362,6 +362,7 @@ clang/lib/Basic/Targets/BPF.cpp
 clang/lib/Basic/Targets/BPF.h
 clang/lib/Basic/Targets/Hexagon.h
 clang/lib/Basic/Targets/Lanai.h
+clang/lib/Basic/Targets/Le64.h
 clang/lib/Basic/Targets/M68k.h
 clang/lib/Basic/Targets/MSP430.h
 clang/lib/Basic/Targets/NVPTX.cpp

diff  --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt
index e7ebc8f191aa6..f30680552e0f5 100644
--- a/clang/lib/Basic/CMakeLists.txt
+++ b/clang/lib/Basic/CMakeLists.txt
@@ -102,6 +102,7 @@ add_clang_library(clangBasic
   Targets/DirectX.cpp
   Targets/Hexagon.cpp
   Targets/Lanai.cpp
+  Targets/Le64.cpp
   Targets/LoongArch.cpp
   Targets/M68k.cpp
   Targets/MSP430.cpp

diff  --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index 0b8e565345b6a..29133f9ee8fce 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -23,6 +23,7 @@
 #include "Targets/DirectX.h"
 #include "Targets/Hexagon.h"
 #include "Targets/Lanai.h"
+#include "Targets/Le64.h"
 #include "Targets/LoongArch.h"
 #include "Targets/M68k.h"
 #include "Targets/MSP430.h"
@@ -343,6 +344,17 @@ std::unique_ptr<TargetInfo> AllocateTarget(const 
llvm::Triple &Triple,
   return std::make_unique(Triple, Opts);
 }
 
+  case llvm::Triple::le32:
+    switch (os) {
+    case llvm::Triple::NaCl:
+      return std::make_unique<NaClTargetInfo<PNaClTargetInfo>>(Triple, Opts);
+    default:
+      return nullptr;
+    }
+
+  case llvm::Triple::le64:
+    return std::make_unique<Le64TargetInfo>(Triple, Opts);
+
   case llvm::Triple::ppc:
 switch (os) {
 case llvm::Triple::Linux:

diff  --git a/clang/lib/Basic/Targets/Le64.cpp 
b/clang/lib/Basic/Targets/Le64.cpp
new file mode 100644
index 0..f7afa0e747d67
--- /dev/null
+++ b/clang/lib/Basic/Targets/Le64.cpp
@@ -0,0 +1,30 @@
+//===--- Le64.cpp - Implement Le64 target feature support ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Le64 TargetInfo objects.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Le64.h"
+#include "Targets.h"
+#include "clang/Basic/Builtins.h"
+#include "clang/Basic/MacroBuilder.h"
+#include "clang/Basic/TargetBuiltins.h"
+
+using namespace clang;
+using namespace clang::targets;
+
+ArrayRef<Builtin::Info> Le64TargetInfo::getTargetBuiltins() const {
+  return {};
+}
+
+void Le64TargetInfo::getTargetDefines(const LangOptions &Opts,
+  MacroBuilder &Builder) const {
+  DefineStd(Builder, "unix", Opts);
+  defineCPUMacros(Builder, "le64", /*Tuning=*/false);
+}

diff  --git a/clang/lib/Basic/Targets/Le64.h b/clang/lib/Basic/Targets/Le64.h
new file mode 100644
index 0..45f6a4e9dd75d
--- /dev/nu
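
A quick probe for the restored target's predefines. Assuming DefineStd and
defineCPUMacros behave as elsewhere in clang/lib/Basic, compiling this for a
le64 triple should print both lines; the exact macro spellings are an
assumption based on the helpers used in getTargetDefines above.

#include <cstdio>

int main() {
#ifdef __le64__
  std::puts("__le64__ defined"); // from defineCPUMacros(Builder, "le64", ...)
#endif
#ifdef __unix__
  std::puts("__unix__ defined"); // from DefineStd(Builder, "unix", Opts)
#endif
  return 0;
}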

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-07-16 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96872

>From f9e72c468c5611797d0e2dd0929799a063f4c20b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 11 Jun 2024 10:58:44 +0200
Subject: [PATCH 1/2] clang/AMDGPU: Emit atomicrmw for
 __builtin_amdgcn_global_atomic_fadd_{f32|f64}

Need to emit syncscope and new metadata to get the native instruction,
most of the time.
---
 clang/lib/CodeGen/CGBuiltin.cpp   | 39 +--
 .../CodeGenOpenCL/builtins-amdgcn-gfx11.cl|  2 +-
 .../builtins-fp-atomics-gfx12.cl  |  4 +-
 .../builtins-fp-atomics-gfx90a.cl |  4 +-
 .../builtins-fp-atomics-gfx940.cl |  4 +-
 5 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 67027f8aa93f3..5f81303972eaa 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18650,8 +18651,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18663,18 +18662,11 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   ArgTy = llvm::FixedVectorType::get(
   llvm::Type::getHalfTy(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -19089,7 +19081,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19105,6 +19099,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19139,8 +19135,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-  SSID = llvm::SyncScope::System;
+  // Most of the builtins do not have syncscope/order arguments. For DS
+  // atomics the scope doesn't really matter, as they implicitly operate at
+  // workgroup scope.
+  //
+  // The global/flat cases need to use agent scope to consistently produce
+  // the native instruction instead of a cmpxchg expansion.
+  SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19155,6 +19156,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
 if (Volatile)
   RMW->setVolatile(true);
+
+unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+  // Most targets require "amdgpu.no.fine.grained.memory" to emit the 
nativ

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins (PR #96873)

2024-07-16 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96873

>From 6083c312c60842002a38951a4d20d8045ff9e812 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:12:59 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from
 {global|flat}_atomic_fadd_v2f16 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 20 ++-
 .../builtins-fp-atomics-gfx12.cl  |  9 ++---
 .../builtins-fp-atomics-gfx90a.cl |  2 +-
 .../builtins-fp-atomics-gfx940.cl |  3 ++-
 4 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b183d7f8122d9..bb1e3e222630b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18651,22 +18651,15 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -18686,11 +18679,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ArgTy = llvm::Type::getFloatTy(getLLVMContext());
   IID = Intrinsic::amdgcn_flat_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19083,7 +19071,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19101,6 +19091,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 6b8a6d14575db..07e63a8711c7f 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -48,7 +48,8 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr 
%{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX12-LABEL:  test_flat_add_2f16
 // GFX12: flat_atomic_pk_add_f16
 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
@@ -64,7 +65,8 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_global_add_half2
-// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr 
addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> 
%{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX12-LABEL:  test_global_add_half2
 // GFX12:  global_atomic_pk_add_f16 v2, v[0:1], v2, off

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64} builtins (PR #96874)

2024-07-16 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96874

>From 20359732636e94a73ad9b3e38a00bb0c2d29078e Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:15:26 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64}
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp | 17 ++---
 .../CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl |  6 --
 .../CodeGenOpenCL/builtins-fp-atomics-gfx940.cl |  3 ++-
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index bb1e3e222630b..33cd7db29d993 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18653,10 +18653,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   }
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
@@ -18666,19 +18664,12 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmin;
   break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19073,7 +19064,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19093,6 +19086,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index cd10777dbe079..02e289427238f 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -45,7 +45,8 @@ void test_global_max_f64(__global double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_add_local_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr 
addrspace(3) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8{{$}}
+
 // GFX90A-LABEL:  test_flat_add_local_f64$local
 // GFX90A:  ds_add_rtn_f64
 void test_flat_add_local_f64(__local double *addr, double x){
@@ -54,7 +55,8 @@ void test_flat_add_local_f64(__local double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_global_add_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_flat_global_add_f64$local
 // GFX90A:  global_atomic_add_f64
 void test_flat_global_add_f64(__global double *addr, double x){
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index 589dcd406630d..bd9b8c7268e06 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -10,7 +10,8 @@ typedef half  _

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (PR #96875)

2024-07-16 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96875

>From bbcab9f7064bb74d14940c118c4b26bdb44e7838 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:34:43 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 26 ++-
 .../builtins-fp-atomics-gfx12.cl  | 24 -
 .../builtins-fp-atomics-gfx90a.cl |  6 ++---
 .../builtins-fp-atomics-gfx940.cl | 14 +++---
 4 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 33cd7db29d993..9c4adb6ece4e2 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18677,22 +18677,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
 return Builder.CreateCall(F, {Addr, Val});
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
-Intrinsic::ID IID;
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19066,7 +19050,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19088,6 +19074,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19132,7 +19120,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   AO = AtomicOrdering::Monotonic;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
-  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
+  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
 llvm::Type *V2BF16Ty = FixedVectorType::get(
 llvm::Type::getBFloatTy(Builder.getContext()), 2);
 Val = Builder.CreateBitCast(Val, V2BF16Ty);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 07e63a8711c7f..e8b6eb57c38d7 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
 
 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> 
[[BC0]] syncscope("agent") monotonic, align 4
+// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x 
bfloat> [[BC0]] syncscope("agent") monotonic, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
 
 // GFX12-LABEL:  test_local_add_2bf16
@@ -48,7 +48,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2f16
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
 
 // GFX12-LABEL:  test_flat_add_2f
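
The i16-to-bfloat dance from the hunk above, extracted into a standalone
sketch (illustrative helper, not the actual CGBuiltin code): the builtin's
operand arrives as <2 x i16> because there is no bf16 builtin type, is bitcast
to <2 x bfloat> for the atomicrmw, and the result is bitcast back.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static Value *emitV2BF16FAdd(IRBuilder<> &Builder, Value *Ptr,
                             Value *ValAsV2I16, SyncScope::ID SSID) {
  Type *V2BF16Ty =
      FixedVectorType::get(Type::getBFloatTy(Builder.getContext()), 2);
  Value *Val = Builder.CreateBitCast(ValAsV2I16, V2BF16Ty);
  Value *RMW = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val,
                                       MaybeAlign(),
                                       AtomicOrdering::Monotonic, SSID);
  // Hand back the <2 x i16> representation the builtin's callers expect.
  return Builder.CreateBitCast(RMW, ValAsV2I16->getType());
}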

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-07-16 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96876

>From 91067b9e4faf66bc8237ffb27dd02436b2305766 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 23:18:32 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max
 f64 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 36 +--
 .../builtins-fp-atomics-gfx90a.cl | 18 ++
 2 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9c4adb6ece4e2..f6ff1d5dc98c1 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18651,32 +18651,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
-Intrinsic::ID IID;
-llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmax;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmax;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F =
-CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19052,7 +19026,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19079,8 +19057,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   BinOp = llvm::AtomicRMWInst::FMin;
   break;
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
   BinOp = llvm::AtomicRMWInst::FMax;
   break;
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 9381ce951df3e..556e553903d1a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -27,7 +27,8 @@ void test_global_add_half2(__global half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_global_global_min_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmin ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_global_min_f64$local
 // GFX90A:  global_atomic_min_f64
 void test_global_global_min_f64(__global double *addr, double x){
@@ -36,7 +37,8 @@ void test_global_global_min_f64(__global double *addr, double 
x){
 }
 
 // CHECK-LABEL: test_global_max_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmax ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_max_f64$local
 // GFX90A:  global_atomic_max_f64
 void test_global_max_f64(__global double *addr, double x){
@@ -65,7 +67,8 @@ void test_flat_global_add_f64(__global double *addr, doub

[llvm-branch-commits] [llvm] AMDGPU: Remove flat/global atomic fadd v2bf16 intrinsics (PR #97050)

2024-07-16 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/97050

>From a23913d09dfd1a5b4d7864cfee70b8d1b947bdf8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 27 Jun 2024 16:32:48 +0200
Subject: [PATCH] AMDGPU: Remove flat/global atomic fadd v2bf16 intrinsics

These are now fully covered by atomicrmw.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   4 -
 llvm/lib/IR/AutoUpgrade.cpp   |  14 +-
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   2 -
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   2 -
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 -
 llvm/lib/Target/AMDGPU/FLATInstructions.td|   2 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   6 +-
 llvm/test/Bitcode/amdgcn-atomic.ll|  22 ++
 .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll| 106 -
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 218 --
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 193 
 11 files changed, 33 insertions(+), 538 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 71b1e832bde3c..9cf4d6352d23d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2907,10 +2907,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic {
 def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic;
 }
 
-// bf16 atomics use v2i16 argument since there is no bf16 data type in the 
llvm.
-def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-
 defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
 def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic;
 def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 53de9eef516b3..f566a0e3c3043 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1034,7 +1034,9 @@ static bool upgradeIntrinsicFunction1(Function *F, 
Function *&NewFn,
   }
 
   if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
-  Name.starts_with("ds.fmax")) {
+  Name.starts_with("ds.fmax") ||
+  Name.starts_with("global.atomic.fadd.v2bf16") ||
+  Name.starts_with("flat.atomic.fadd.v2bf16")) {
 // Replaced with atomicrmw fadd/fmin/fmax, so there's no new
 // declaration.
 NewFn = nullptr;
@@ -4042,7 +4044,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, 
CallBase *CI,
   .StartsWith("ds.fmin", AtomicRMWInst::FMin)
   .StartsWith("ds.fmax", AtomicRMWInst::FMax)
   .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
-  .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
+  .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap)
+  .StartsWith("global.atomic.fadd", AtomicRMWInst::FAdd)
+  .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd);
 
   unsigned NumOperands = CI->getNumOperands();
   if (NumOperands < 3) // Malformed bitcode.
@@ -4097,8 +4101,10 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, 
CallBase *CI,
   Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID);
 
   if (PtrTy->getAddressSpace() != 3) {
-RMW->setMetadata("amdgpu.no.fine.grained.memory",
- MDNode::get(F->getContext(), {}));
+MDNode *EmptyMD = MDNode::get(F->getContext(), {});
+RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+if (RMWOp == AtomicRMWInst::FAdd && RetTy->isFloatTy())
+  RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
   }
 
   if (IsVolatile)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index c6dbc58395e48..db8b44149cf47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -620,12 +620,10 @@ multiclass local_addr_space_atomic_op {
 
 defm int_amdgcn_flat_atomic_fadd : noret_op;
 defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
-defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op;
 defm int_amdgcn_flat_atomic_fmin : noret_op;
 defm int_amdgcn_flat_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
 defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
-defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op;
 defm int_amdgcn_global_atomic_fmin : noret_op;
 defm int_amdgcn_global_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_csub : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 17413ab55536d..91d033fa3ba53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4897,8 +4897,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
 case Intrinsic::amdgcn_flat_atomic_fmax:
 case Intrinsic

[llvm-branch-commits] [compiler-rt] 601bd9d - Revert "[compiler-rt] adding preadv2/pwritev2 interceptions. (#97216)"

2024-07-16 Thread via llvm-branch-commits

Author: Daniel Thornburgh
Date: 2024-07-16T12:18:41-07:00
New Revision: 601bd9d8648398ffa1fe69f48304c2369e97e2db

URL: https://github.com/llvm/llvm-project/commit/601bd9d8648398ffa1fe69f48304c2369e97e2db
DIFF: https://github.com/llvm/llvm-project/commit/601bd9d8648398ffa1fe69f48304c2369e97e2db.diff

LOG: Revert "[compiler-rt] adding preadv2/pwritev2 interceptions. (#97216)"

This reverts commit bc1c84aee5b33c30e7bfe1e4a65a64650ec357db.

Added: 


Modified: 
compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h

Removed: 
compiler-rt/test/sanitizer_common/TestCases/Linux/preadv2.cpp



diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 032b04a09ae76..a6066a6226e1b 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -10264,38 +10264,6 @@ INTERCEPTOR(int, cpuset_getaffinity, int level, int which, __int64_t id, SIZE_T
 #define INIT_CPUSET_GETAFFINITY
 #endif
 
-#if SANITIZER_INTERCEPT_PREADV2
-INTERCEPTOR(SSIZE_T, preadv2, int fd, __sanitizer_iovec *iov, int iovcnt,
-OFF_T offset, int flags) {
-  void *ctx;
-  COMMON_INTERCEPTOR_ENTER(ctx, preadv2, fd, iov, iovcnt, offset, flags);
-  COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
-  SSIZE_T res = REAL(preadv2)(fd, iov, iovcnt, offset, flags);
-  if (res > 0) write_iovec(ctx, iov, iovcnt, res);
-  if (res >= 0 && fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
-  return res;
-}
-#define INIT_PREADV2 COMMON_INTERCEPT_FUNCTION(preadv2)
-#else
-#define INIT_PREADV2
-#endif
-
-#if SANITIZER_INTERCEPT_PWRITEV2
-INTERCEPTOR(SSIZE_T, pwritev2, int fd, __sanitizer_iovec *iov, int iovcnt,
-OFF_T offset, int flags) {
-  void *ctx;
-  COMMON_INTERCEPTOR_ENTER(ctx, pwritev2, fd, iov, iovcnt, offset, flags);
-  COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
-  if (fd >= 0) COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
-  SSIZE_T res = REAL(pwritev2)(fd, iov, iovcnt, offset, flags);
-  if (res > 0) read_iovec(ctx, iov, iovcnt, res);
-  return res;
-}
-#define INIT_PWRITEV2 COMMON_INTERCEPT_FUNCTION(pwritev2)
-#else
-#define INIT_PWRITEV2
-#endif
-
 #include "sanitizer_common_interceptors_netbsd_compat.inc"
 
 namespace __sanitizer {
@@ -10615,8 +10583,6 @@ static void InitializeCommonInterceptors() {
   INIT___XUNAME;
   INIT_ARGP_PARSE;
   INIT_CPUSET_GETAFFINITY;
-  INIT_PREADV2;
-  INIT_PWRITEV2;
 
   INIT___PRINTF_CHK;
 }

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
index c94368b6b0ebb..de55c736d0e14 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -598,9 +598,6 @@
 #define SANITIZER_INTERCEPT_PROCCTL SI_FREEBSD
 #define SANITIZER_INTERCEPT_ARGP_PARSE SI_GLIBC
 #define SANITIZER_INTERCEPT_CPUSET_GETAFFINITY SI_FREEBSD
-// FIXME: also available from musl 1.2.5
-#define SANITIZER_INTERCEPT_PREADV2 SI_GLIBC
-#define SANITIZER_INTERCEPT_PWRITEV2 SI_GLIBC
 
 // This macro gives a way for downstream users to override the above
 // interceptor macros irrespective of the platform they are on. They have

diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/preadv2.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/preadv2.cpp
deleted file mode 100644
index 176347f78ecdc..0
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/preadv2.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: %clangxx -O0 %s -o %t
-
-// REQUIRES: glibc
-
-#include <assert.h>
-#include <fcntl.h>
-#include <sys/uio.h>
-#include <unistd.h>
-
-int main(void) {
-  int fd = open("/proc/self/stat", O_RDONLY);
-  char bufa[7];
-  char bufb[7];
-  struct iovec vec[2];
-  vec[0].iov_base = bufa + 4;
-  vec[0].iov_len = 1;
-  vec[1].iov_base = bufb;
-  vec[1].iov_len = sizeof(bufb);
-  ssize_t rd = preadv2(fd, vec, 2, 0, 0);
-  assert(rd > 0);
-  vec[0].iov_base = bufa;
-  rd = preadv2(fd, vec, 2, 0, 0);
-  assert(rd > 0);
-  rd = preadv2(fd, vec, 5, -25, 0);
-  assert(rd < 0);
-  close(fd);
-  return 0;
-}



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] a4c3229 - Revert "[tsan] Replace ALIGNED with alignas"

2024-07-16 Thread via llvm-branch-commits

Author: Daniel Thornburgh
Date: 2024-07-16T14:09:39-07:00
New Revision: a4c3229ed0dac390bd3585f4cda5a1daaa3e97fc

URL: https://github.com/llvm/llvm-project/commit/a4c3229ed0dac390bd3585f4cda5a1daaa3e97fc
DIFF: https://github.com/llvm/llvm-project/commit/a4c3229ed0dac390bd3585f4cda5a1daaa3e97fc.diff

LOG: Revert "[tsan] Replace ALIGNED with alignas"

This reverts commit 656f617ac772c54e0bee9d499e7ca232137ddb35.

Added: 


Modified: 
compiler-rt/lib/tsan/rtl/tsan_defs.h
compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
compiler-rt/lib/tsan/rtl/tsan_mman.cpp
compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
compiler-rt/lib/tsan/rtl/tsan_rtl.h
compiler-rt/lib/tsan/rtl/tsan_suppressions.cpp
compiler-rt/lib/tsan/rtl/tsan_vector_clock.h

Removed: 




diff --git a/compiler-rt/lib/tsan/rtl/tsan_defs.h b/compiler-rt/lib/tsan/rtl/tsan_defs.h
index 270d441dc90b7..1ffa3d6aec40b 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_defs.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_defs.h
@@ -30,7 +30,7 @@
 #  define __MM_MALLOC_H
 #  include 
 #  include 
-#  define VECTOR_ALIGNED alignas(16)
+#  define VECTOR_ALIGNED ALIGNED(16)
 typedef __m128i m128;
 #else
 #  define VECTOR_ALIGNED

diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
index 9cab2a3727128..034ae3d322b56 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
@@ -208,7 +208,7 @@ struct AtExitCtx {
 struct InterceptorContext {
   // The object is 64-byte aligned, because we want hot data to be located
   // in a single cache line if possible (it's accessed in every interceptor).
-  alignas(64) LibIgnore libignore;
+  ALIGNED(64) LibIgnore libignore;
   __sanitizer_sigaction sigactions[kSigCount];
 #if !SANITIZER_APPLE && !SANITIZER_NETBSD
   unsigned finalize_key;
@@ -220,7 +220,7 @@ struct InterceptorContext {
   InterceptorContext() : libignore(LINKER_INITIALIZED), atexit_mu(MutexTypeAtExit), AtExitStack() {}
 };
 
-alignas(64) static char interceptor_placeholder[sizeof(InterceptorContext)];
+static ALIGNED(64) char interceptor_placeholder[sizeof(InterceptorContext)];
 InterceptorContext *interceptor_ctx() {
   return reinterpret_cast(&interceptor_placeholder[0]);
 }
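
A note on why the qualifier changes position in these hunks: alignas is a C++11
specifier that must lead the declaration, while sanitizer_common's ALIGNED
macro expands to a GCC/Clang attribute (roughly
#define ALIGNED(x) __attribute__((aligned(x))); the exact definition is not
shown in this patch) and may also trail the declarator. A minimal sketch:

  // Keyword form: the specifier must precede the declaration.
  alignas(64) static char placeholder_a[64];

  // Attribute form: the macro may trail the declarator, as in the hunks above.
  #define ALIGNED(x) __attribute__((aligned(x)))
  static char placeholder_b[64] ALIGNED(64);
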

diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
index befd6a369026d..5154662034c56 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
@@ -76,7 +76,7 @@ struct DynamicAnnContext {
 };
 
 static DynamicAnnContext *dyn_ann_ctx;
-alignas(64) static char dyn_ann_ctx_placeholder[sizeof(DynamicAnnContext)];
+static char dyn_ann_ctx_placeholder[sizeof(DynamicAnnContext)] ALIGNED(64);
 
 static void AddExpectRace(ExpectRace *list,
 char *f, int l, uptr addr, uptr size, char *desc) {

diff --git a/compiler-rt/lib/tsan/rtl/tsan_mman.cpp b/compiler-rt/lib/tsan/rtl/tsan_mman.cpp
index 0705365d77427..e129e9af272f5 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_mman.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_mman.cpp
@@ -54,7 +54,7 @@ struct MapUnmapCallback {
   }
 };
 
-alignas(64) static char allocator_placeholder[sizeof(Allocator)];
+static char allocator_placeholder[sizeof(Allocator)] ALIGNED(64);
 Allocator *allocator() {
   return reinterpret_cast(&allocator_placeholder);
 }
@@ -75,7 +75,7 @@ struct GlobalProc {
 internal_alloc_mtx(MutexTypeInternalAlloc) {}
 };
 
-alignas(64) static char global_proc_placeholder[sizeof(GlobalProc)];
+static char global_proc_placeholder[sizeof(GlobalProc)] ALIGNED(64);
 GlobalProc *global_proc() {
   return reinterpret_cast(&global_proc_placeholder);
 }

diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
index c8a66e60a69f1..07d83e1a9a9ff 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
@@ -46,7 +46,7 @@
 namespace __tsan {
 
 #if !SANITIZER_GO
-static char main_thread_state[sizeof(ThreadState)] alignas(
+static char main_thread_state[sizeof(ThreadState)] ALIGNED(
 SANITIZER_CACHE_LINE_SIZE);
 static ThreadState *dead_thread_state;
 static pthread_key_t thread_state_key;

diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
index bf29aa316f680..e5ebb65754b32 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
@@ -48,10 +48,11 @@ int (*on_finalize)(int);
 #endif
 
 #if !SANITIZER_GO && !SANITIZER_APPLE
-alignas(SANITIZER_CACHE_LINE_SIZE) THREADLOCAL __attribute__((tls_model(
-"initial-exec"))) char cur_thread_placeholder[sizeof(ThreadState)];
+__attribute__((tls_model("initial-exec")))
+T

[llvm-branch-commits] [llvm] [BOLT] Match functions with call graph (PR #98125)

2024-07-16 Thread Shaw Young via llvm-branch-commits

https://github.com/shawbyoung updated 
https://github.com/llvm/llvm-project/pull/98125

From cf32a43e7c2b04079c6123fe13df4fb7226d771f Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Tue, 9 Jul 2024 10:04:25 -0700
Subject: [PATCH 01/13] Comments

Created using spr 1.3.4
---
 bolt/lib/Profile/YAMLProfileReader.cpp | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 69ea0899c5f2c..6753337c24ea7 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -501,7 +501,6 @@ size_t YAMLProfileReader::matchWithCallGraph(BinaryContext &BC) {
 
   // Maps binary functions to adjacent functions in the FCG.
   for (const BinaryFunction *CallerBF : BFs) {
-// Add all call targets to the hash map.
 for (const BinaryBasicBlock &BB : CallerBF->blocks()) {
   for (const MCInst &Inst : BB) {
if (!BC.MIB->isCall(Inst))
@@ -533,7 +532,8 @@ size_t YAMLProfileReader::matchWithCallGraph(BinaryContext &BC) {
 }
   }
 
-  // Create mapping from neighbor hash to BFs.
+  // Using the constructed adjacent function mapping, creates mapping from
+  // neighbor hash to BFs.
   std::unordered_map>
   NeighborHashToBFs;
   for (const BinaryFunction *BF : BFs) {
@@ -552,12 +552,12 @@ size_t YAMLProfileReader::matchWithCallGraph(BinaryContext &BC) {
 .push_back(BF);
   }
 
-  // TODO: change call anchor PR to have this representation - we need it here
+  // TODO: note, this will be introduced in the matching functions with calls
+  // as anchors pr
   DenseMap
   IdToYAMLBF;
-  // TODO: change call anchor PR to have this representation - we need it here
 
-  // Maps hashes to profiled functions.
+  // Maps YAML functions to adjacent functions in the profile FCG.
   std::unordered_map
   YamlBFToHashes(BFs.size());
@@ -590,7 +590,7 @@ size_t YAMLProfileReader::matchWithCallGraph(BinaryContext &BC) {
 }
   }
 
-  // Matching YAMLBF with neighbor hashes.
+  // Matches YAMLBF to BFs with neighbor hashes.
   for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) {
 if (YamlBF.Used)
   continue;

From ee9049fc4bd3d4203c19c9c0982a78ab3b47666f Mon Sep 17 00:00:00 2001
From: shawbyoung 
Date: Tue, 9 Jul 2024 13:52:05 -0700
Subject: [PATCH 02/13] Moved blended hash definition

Created using spr 1.3.4
---
 bolt/include/bolt/Profile/YAMLProfileReader.h |  69 ++-
 bolt/lib/Profile/StaleProfileMatching.cpp |  65 ---
 bolt/lib/Profile/YAMLProfileReader.cpp| 110 --
 3 files changed, 119 insertions(+), 125 deletions(-)

diff --git a/bolt/include/bolt/Profile/YAMLProfileReader.h b/bolt/include/bolt/Profile/YAMLProfileReader.h
index 36e8f8739eee1..e8a34ecad9a08 100644
--- a/bolt/include/bolt/Profile/YAMLProfileReader.h
+++ b/bolt/include/bolt/Profile/YAMLProfileReader.h
@@ -16,6 +16,73 @@
 namespace llvm {
 namespace bolt {
 
+/// An object wrapping several components of a basic block hash. The combined
+/// (blended) hash is represented and stored as one uint64_t, while individual
+/// components are of smaller size (e.g., uint16_t or uint8_t).
+struct BlendedBlockHash {
+private:
+  using ValueOffset = Bitfield::Element;
+  using ValueOpcode = Bitfield::Element;
+  using ValueInstr = Bitfield::Element;
+  using ValuePred = Bitfield::Element;
+  using ValueSucc = Bitfield::Element;
+
+public:
+  explicit BlendedBlockHash() {}
+
+  explicit BlendedBlockHash(uint64_t Hash) {
+Offset = Bitfield::get(Hash);
+OpcodeHash = Bitfield::get(Hash);
+InstrHash = Bitfield::get(Hash);
+PredHash = Bitfield::get(Hash);
+SuccHash = Bitfield::get(Hash);
+  }
+
+  /// Combine the blended hash into uint64_t.
+  uint64_t combine() const {
+uint64_t Hash = 0;
+Bitfield::set(Hash, Offset);
+Bitfield::set(Hash, OpcodeHash);
+Bitfield::set(Hash, InstrHash);
+Bitfield::set(Hash, PredHash);
+Bitfield::set(Hash, SuccHash);
+return Hash;
+  }
+
+  /// Compute a distance between two given blended hashes. The smaller the
+  /// distance, the more similar two blocks are. For identical basic blocks,
+  /// the distance is zero.
+  uint64_t distance(const BlendedBlockHash &BBH) const {
+assert(OpcodeHash == BBH.OpcodeHash &&
+   "incorrect blended hash distance computation");
+uint64_t Dist = 0;
+// Account for NeighborHash
+Dist += SuccHash == BBH.SuccHash ? 0 : 1;
+Dist += PredHash == BBH.PredHash ? 0 : 1;
+Dist <<= 16;
+// Account for InstrHash
+Dist += InstrHash == BBH.InstrHash ? 0 : 1;
+Dist <<= 16;
+// Account for Offset
+Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
+return Dist;
+  }
+
+  /// The offset of the basic block from the function start.
+  uint16_t Offset{0};
+  /// (Loose) Hash of the basic block instructions, excluding operands.
+  uint16_t OpcodeHash{0};
+  /// (Str

[llvm-branch-commits] [llvm] [BOLT] Match functions with call graph (PR #98125)

2024-07-16 Thread Shaw Young via llvm-branch-commits


@@ -446,6 +503,56 @@ size_t YAMLProfileReader::matchWithLTOCommonName() {
   return MatchedWithLTOCommonName;
 }
 
+size_t YAMLProfileReader::matchWithCallGraph(BinaryContext &BC) {
+  if (!opts::MatchWithCallGraph)
+return 0;
+
+  size_t MatchedWithCallGraph = 0;
+  CGMatcher.computeBFNeighborHashes(BC);
+  CGMatcher.constructYAMLFCG(YamlBP, IdToYamLBF);
+
+  // Matches YAMLBF to BFs with neighbor hashes.
+  for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) {
+if (YamlBF.Used)
+  continue;
+auto It = CGMatcher.YamlBFAdjacencyMap.find(&YamlBF);
+if (It == CGMatcher.YamlBFAdjacencyMap.end())
+  continue;
+// Computes profiled function's neighbor hash.
+std::set &AdjacentFunctions =
+It->second;
+std::string AdjacentFunctionHashStr;
+for (auto &AdjacentFunction : AdjacentFunctions) {
+  AdjacentFunctionHashStr += AdjacentFunction->Name;
+}
+uint64_t Hash = std::hash{}(AdjacentFunctionHashStr);
+auto NeighborHashToBFsIt = CGMatcher.NeighborHashToBFs.find(Hash);
+if (NeighborHashToBFsIt == CGMatcher.NeighborHashToBFs.end())
+  continue;
+// Finds the binary function with the closest block size to the profiled

shawbyoung wrote:

1. In a binary with 953488 binary functions and 13376 profiled functions, the largest bucket had 151 functions. That bucket was an outlier, though - the median and mean bucket sizes were 2 and ~5, respectively, and more than half of the buckets contained only one binary function.
2. I agree - I just implemented LCP name matching as opposed to block count; a sketch follows below.
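
A hedged sketch of what LCP-based selection inside a bucket could look like;
the helper names and the stand-in struct below are illustrative, not BOLT's
actual code:

  #include <algorithm>
  #include <string>
  #include <vector>

  // Length of the longest common prefix of two names.
  static size_t commonPrefixLen(const std::string &A, const std::string &B) {
    size_t N = std::min(A.size(), B.size()), I = 0;
    while (I < N && A[I] == B[I])
      ++I;
    return I;
  }

  struct BinaryFunctionStub { std::string Name; }; // stand-in for BinaryFunction

  // Picks the candidate whose name shares the longest common prefix with the
  // profiled function's name; returns nullptr for an empty bucket.
  static BinaryFunctionStub *
  pickByLongestCommonPrefix(const std::string &YamlName,
                            const std::vector<BinaryFunctionStub *> &Bucket) {
    BinaryFunctionStub *Best = nullptr;
    size_t BestLen = 0;
    for (BinaryFunctionStub *BF : Bucket) {
      size_t Len = commonPrefixLen(YamlName, BF->Name);
      if (Len > BestLen) {
        BestLen = Len;
        Best = BF;
      }
    }
    return Best;
  }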

https://github.com/llvm/llvm-project/pull/98125


[llvm-branch-commits] [llvm] [BOLT] Match functions with call graph (PR #98125)

2024-07-16 Thread Shaw Young via llvm-branch-commits

https://github.com/shawbyoung edited 
https://github.com/llvm/llvm-project/pull/98125


[llvm-branch-commits] libc: Use UMAXV.4S to reduce bcmp result. (PR #99260)

2024-07-16 Thread via llvm-branch-commits

https://github.com/pcc created https://github.com/llvm/llvm-project/pull/99260

We can use UMAXV.4S to reduce the comparison result in a single
instruction. This improves performance by roughly 4% on Apple M1:

Summary
  bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 ran
1.01 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.01 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.01 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.01 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.02 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.03 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.03 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.05 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.05 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.05 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10
1.05 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10

(1 = original, 2 = a variant of this patch that uses UMAXV.16B, 3 = this patch)
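
For readers unfamiliar with the trick, a minimal sketch of the reduction on a
single 16-byte block follows (illustrative only, not the libc implementation;
the function name is made up). The vmaxvq_u32 intrinsic compiles to a single
UMAXV.4S on AArch64:

  #include <arm_neon.h>
  #include <cstdint>

  // Returns nonzero iff the two 16-byte blocks differ.
  uint32_t bcmp16_sketch(const uint8_t *p1, const uint8_t *p2) {
    uint8x16_t a = vld1q_u8(p1);
    uint8x16_t b = vld1q_u8(p2);
    uint8x16_t diff = veorq_u8(a, b); // nonzero lanes mark mismatching bytes
    // Reinterpret as 4x32b lanes and take the horizontal max: one UMAXV.4S.
    return vmaxvq_u32(vreinterpretq_u32_u8(diff));
  }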





[llvm-branch-commits] libc: Use UMAXV.4S to reduce bcmp result. (PR #99260)

2024-07-16 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-libc

Author: None (pcc)


Changes

We can use UMAXV.4S to reduce the comparison result in a single instruction. This improves performance by roughly 4% on Apple M1; the benchmark summary is identical to the one in the PR description above.


---
Full diff: https://github.com/llvm/llvm-project/pull/99260.diff


1 Files Affected:

- (modified) libc/src/string/memory_utils/op_aarch64.h (+6-12) 


``diff
diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h
index 1090ea2617f09..5c08a6ae48b04 100644
--- a/libc/src/string/memory_utils/op_aarch64.h
+++ b/libc/src/string/memory_utils/op_aarch64.h
@@ -84,8 +84,7 @@ template  struct Bcmp {
   uint8x16_t a = vld1q_u8(_p1);
   uint8x16_t n = vld1q_u8(_p2);
   uint8x16_t an = veorq_u8(a, n);
-  uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
-  return vmaxv_u32(an_reduced);
+  return vmaxvq_u32(vreinterpretq_u32_u8(an));
 } else if constexpr (Size == 32) {
   auto _p1 = as_u8(p1);
   auto _p2 = as_u8(p2);
@@ -97,12 +96,9 @@ template  struct Bcmp {
   uint8x16_t bo = veorq_u8(b, o);
   // anbo = (a ^ n) | (b ^ o).  At least one byte is nonzero if there is
   // a difference between the two buffers.  We reduce this value down to 4
-  // bytes in two steps. First, calculate the saturated move value when
-  // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
-  // a single 32 bit nonzero value if a mismatch occurred.
+  // bytes using the UMAXV instruction to compute the max across the vector.
   uint8x16_t anbo = vorrq_u8(an, bo);
-  uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
-  return vmaxv_u32(anbo_reduced);
+  return vmaxvq_u32(vreinterpretq_u32_u8(anbo));
 } else if constexpr ((Size % BlockSize) == 0) {
   for (size_t offset = 0; offset < Size; offset += BlockSize)
 if (auto value = Bcmp::block(p1 + offset, p2 + offset))
@@ -129,8 +125,7 @@ template  struct Bcmp {
   uint8x16_t bo = veorq_u8(b, o);
   // anbo = (a ^ n) | (b ^ o)
   uint8x16_t anbo = vorrq_u8(an, bo);
-  uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
-  return vmaxv_u32(anbo_reduced);
+  return vmaxvq_u32(vreinterpretq_u32_u8(anbo));
 } else if constexpr (Size == 32) {
   auto _p1 = as_u8(p1);
   auto _p2 = as_u8(p2);
@@ -150,9 +145,8 @@ template  struct Bcmp {
   uint8x16_t cpdq = vorrq_u8(cp, dq);
   // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)).  Reduce this to
   // a nonzero 32 bit value if a mismatch occurred.
-  uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
-  uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
-  return vmaxv_u32(abnocpdq_reduced);
+  uint8x16_t abnocpdq = anbo | cpdq;
+  return vmaxvq_u32(vreinterpretq_u32_u8(abnocpdq));
 } else {
   static_assert(cpp::always_false, "SIZE not implemented"

[llvm-branch-commits] [libc] Use UMAXV.4S to reduce bcmp result. (PR #99260)

2024-07-16 Thread via llvm-branch-commits

https://github.com/lntue edited https://github.com/llvm/llvm-project/pull/99260


[llvm-branch-commits] [llvm] AMDGPU: Handle new atomicrmw metadata for fadd case (PR #96760)

2024-07-16 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/96760


[llvm-branch-commits] [llvm] AMDGPU: Handle new atomicrmw metadata for fadd case (PR #96760)

2024-07-16 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm commented:

ping

https://github.com/llvm/llvm-project/pull/96760