from:"Durgadoss R via cfe\-commits"

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits


https://github.com/durga4github created 
https://github.com/llvm/llvm-project/pull/94735

This patch adds APFloat type support for two FP6 data types, E2M3 and E3M2.
The definitions for the two formats are detailed in section 5.3.2 of the
OCP specification, which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

>From 21432aeaeecc6ba06de252c460e5cb09abb68a29 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 5 Jun 2024 19:22:31 +0530
Subject: [PATCH] [APFloat] Add APFloat support for FP6 data types

This patch adds APFloat type support for two FP6
data types, E2M3 and E3M2. The definitions for the
two formats are detailed in section 5.3.2 of the
OCP specification, which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

Signed-off-by: Durgadoss R 
---
 clang/lib/AST/MicrosoftMangle.cpp  |   2 +
 llvm/include/llvm/ADT/APFloat.h|  14 +
 llvm/lib/Support/APFloat.cpp   |  89 +-
 llvm/unittests/ADT/APFloatTest.cpp | 496 +++--
 4 files changed, 567 insertions(+), 34 deletions(-)

diff --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index 36d611750ca48..72c79dab6bdcc 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -899,6 +899,8 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat 
Number) {
   case APFloat::S_Float8E4M3FNUZ:
   case APFloat::S_Float8E4M3B11FNUZ:
   case APFloat::S_FloatTF32:
+  case APFloat::S_Float6E3M2FN:
+  case APFloat::S_Float6E2M3FN:
 llvm_unreachable("Tried to mangle unexpected APFloat semantics");
   }
 
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 44a301ecc9928..149b7a165c9d4 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -189,6 +189,14 @@ struct APFloatBase {
 // improved range compared to half (16-bit) formats, at (potentially)
 // greater throughput than single precision (32-bit) formats.
 S_FloatTF32,
+// 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E3M2FN,
+// 6-bit floating point number with bit layout S1E2M3. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E2M3FN,
 
 S_x87DoubleExtended,
 S_MaxSemantics = S_x87DoubleExtended,
@@ -209,6 +217,8 @@ struct APFloatBase {
   static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
   static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
   static const fltSemantics &FloatTF32() LLVM_READNONE;
+  static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
+  static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -627,6 +637,8 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
   APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
   APInt convertFloatTF32APFloatToAPInt() const;
+  APInt convertFloat6E3M2FNAPFloatToAPInt() const;
+  APInt convertFloat6E2M3FNAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   template  void initFromIEEEAPInt(const APInt &api);
   void initFromHalfAPInt(const APInt &api);
@@ -642,6 +654,8 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloat8E4M3FNUZAPInt(const APInt &api);
   void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
   void initFromFloatTF32APInt(const APInt &api);
+  void initFromFloat6E3M2FNAPInt(const APInt &api);
+  void initFromFloat6E2M3FNAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 283fcc153b33a..b8ca56d96efe4 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types.
+  // There is no representation for Inf or NaN.
+  NoNanInf,
 };
 
 // How NaN values are represented. This is curently only used in combination
@@ -139,6 +143,10 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = {
 static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
 4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
 static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
+static co

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/94735

>From ac137c57ee35e1662b40796637eb4b25aa773849 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 5 Jun 2024 19:22:31 +0530
Subject: [PATCH] [APFloat] Add APFloat support for FP6 data types

This patch adds APFloat type support for two FP6
data types, E2M3 and E3M2. The definitions for the
two formats are detailed in section 5.3.2 of the
OCP specification, which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

Signed-off-by: Durgadoss R 
---
 clang/lib/AST/MicrosoftMangle.cpp  |   2 +
 llvm/include/llvm/ADT/APFloat.h|  14 +
 llvm/lib/Support/APFloat.cpp   |  89 +-
 llvm/unittests/ADT/APFloatTest.cpp | 488 +++--
 4 files changed, 559 insertions(+), 34 deletions(-)

diff --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index 36d611750ca48..72c79dab6bdcc 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -899,6 +899,8 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat 
Number) {
   case APFloat::S_Float8E4M3FNUZ:
   case APFloat::S_Float8E4M3B11FNUZ:
   case APFloat::S_FloatTF32:
+  case APFloat::S_Float6E3M2FN:
+  case APFloat::S_Float6E2M3FN:
 llvm_unreachable("Tried to mangle unexpected APFloat semantics");
   }
 
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 44a301ecc9928..149b7a165c9d4 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -189,6 +189,14 @@ struct APFloatBase {
 // improved range compared to half (16-bit) formats, at (potentially)
 // greater throughput than single precision (32-bit) formats.
 S_FloatTF32,
+// 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E3M2FN,
+// 6-bit floating point number with bit layout S1E2M3. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E2M3FN,
 
 S_x87DoubleExtended,
 S_MaxSemantics = S_x87DoubleExtended,
@@ -209,6 +217,8 @@ struct APFloatBase {
   static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
   static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
   static const fltSemantics &FloatTF32() LLVM_READNONE;
+  static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
+  static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -627,6 +637,8 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
   APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
   APInt convertFloatTF32APFloatToAPInt() const;
+  APInt convertFloat6E3M2FNAPFloatToAPInt() const;
+  APInt convertFloat6E2M3FNAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   template  void initFromIEEEAPInt(const APInt &api);
   void initFromHalfAPInt(const APInt &api);
@@ -642,6 +654,8 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloat8E4M3FNUZAPInt(const APInt &api);
   void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
   void initFromFloatTF32APInt(const APInt &api);
+  void initFromFloat6E3M2FNAPInt(const APInt &api);
+  void initFromFloat6E2M3FNAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 283fcc153b33a..04d54ee66309c 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types.
+  // There is no representation for Inf or NaN.
+  NoNanInf,
 };
 
 // How NaN values are represented. This is curently only used in combination
@@ -139,6 +143,10 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = {
 static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
 4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
 static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
+static constexpr fltSemantics semFloat6E3M2FN = {
+4, -2, 3, 6, fltNonfiniteBehavior::NoNanInf};
+static constexpr fltSemantics semFloat6E2M3FN = {
+2, 0, 4, 6, fltNonfiniteBehavior::NoNanInf};
 static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
 static con

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits


durga4github wrote:

@ThomasRaoux , Could you please help review this change?

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/94735

>From 44b05720e7abe2344925158f7b76904990155500 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 5 Jun 2024 19:22:31 +0530
Subject: [PATCH] [APFloat] Add APFloat support for FP6 data types

This patch adds APFloat type support for two FP6
data types, E2M3 and E3M2. The definitions for the
two formats are detailed in section 5.3.2 of the
OCP specification, which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

Signed-off-by: Durgadoss R 
---
 clang/lib/AST/MicrosoftMangle.cpp  |   2 +
 llvm/include/llvm/ADT/APFloat.h|  25 ++
 llvm/lib/Support/APFloat.cpp   |  89 +-
 llvm/unittests/ADT/APFloatTest.cpp | 484 +++--
 4 files changed, 566 insertions(+), 34 deletions(-)

diff --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index 36d611750ca48..72c79dab6bdcc 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -899,6 +899,8 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat 
Number) {
   case APFloat::S_Float8E4M3FNUZ:
   case APFloat::S_Float8E4M3B11FNUZ:
   case APFloat::S_FloatTF32:
+  case APFloat::S_Float6E3M2FN:
+  case APFloat::S_Float6E2M3FN:
 llvm_unreachable("Tried to mangle unexpected APFloat semantics");
   }
 
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 44a301ecc9928..892f822309576 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -189,6 +189,14 @@ struct APFloatBase {
 // improved range compared to half (16-bit) formats, at (potentially)
 // greater throughput than single precision (32-bit) formats.
 S_FloatTF32,
+// 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E3M2FN,
+// 6-bit floating point number with bit layout S1E2M3. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E2M3FN,
 
 S_x87DoubleExtended,
 S_MaxSemantics = S_x87DoubleExtended,
@@ -209,6 +217,8 @@ struct APFloatBase {
   static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
   static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
   static const fltSemantics &FloatTF32() LLVM_READNONE;
+  static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
+  static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -627,6 +637,8 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
   APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
   APInt convertFloatTF32APFloatToAPInt() const;
+  APInt convertFloat6E3M2FNAPFloatToAPInt() const;
+  APInt convertFloat6E2M3FNAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   template  void initFromIEEEAPInt(const APInt &api);
   void initFromHalfAPInt(const APInt &api);
@@ -642,6 +654,8 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloat8E4M3FNUZAPInt(const APInt &api);
   void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
   void initFromFloatTF32APInt(const APInt &api);
+  void initFromFloat6E3M2FNAPInt(const APInt &api);
+  void initFromFloat6E2M3FNAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);
@@ -1039,6 +1053,17 @@ class APFloat : public APFloatBase {
   /// \param Semantics - type float semantics
   static APFloat getAllOnesValue(const fltSemantics &Semantics);
 
+  static bool hasNanOrInf(const fltSemantics &Sem) {
+switch (SemanticsToEnum(Sem)) {
+  default:
+return true;
+  // Below Semantics do not support {NaN or Inf}
+  case APFloat::S_Float6E3M2FN:
+  case APFloat::S_Float6E2M3FN:
+return false;
+}
+  }
+
   /// Used to insert APFloat objects, or objects that contain APFloat objects,
   /// into FoldingSets.
   void Profile(FoldingSetNodeID &NID) const;
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 283fcc153b33a..04d54ee66309c 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types.
+  // There is no representation for Inf or NaN.
+  NoNanInf,
 };
 
 // How NaN values are repr

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits



@@ -47,6 +47,10 @@ static std::string convertToString(double d, unsigned Prec, 
unsigned Pad,
   return std::string(Buffer.data(), Buffer.size());
 }
 
+static bool hasNanOrInf(APFloat::Semantics S) {
+  return (S != APFloat::S_Float6E3M2FN) && (S != APFloat::S_Float6E2M3FN);
+}

durga4github wrote:

Sure, moved it as a static-helper inside APFloat and using that here

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits



@@ -1881,6 +1890,20 @@ TEST(APFloatTest, getSmallest) {
   EXPECT_TRUE(test.isFiniteNonZero());
   EXPECT_TRUE(test.isDenormal());
   EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getSmallest(APFloat::Float6E3M2FN(), false);
+  expected = APFloat(APFloat::Float6E3M2FN(), "0x0.1p0");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getSmallest(APFloat::Float6E2M3FN(), false);
+  expected = APFloat(APFloat::Float6E2M3FN(), "0x0.2p0");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
 }
 

durga4github wrote:

Yes, the getZero test is added below.
We have added tests for get{Zero/Largest/Smallest/SmallestNormalized} and 
negative tests for get{Nan/Inf} cases.

Please let me know if I am still missing something.


https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/94735

>From 3fd700cb6cf349218558ad8caae081629e01d986 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 5 Jun 2024 19:22:31 +0530
Subject: [PATCH] [APFloat] Add APFloat support for FP6 data types

This patch adds APFloat type support for two FP6
data types, E2M3 and E3M2. The definitions for the
two formats are detailed in section 5.3.2 of the
OCP specification, which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

Signed-off-by: Durgadoss R 
---
 clang/lib/AST/MicrosoftMangle.cpp  |   2 +
 llvm/include/llvm/ADT/APFloat.h|  25 ++
 llvm/lib/Support/APFloat.cpp   |  89 +-
 llvm/unittests/ADT/APFloatTest.cpp | 484 +++--
 4 files changed, 566 insertions(+), 34 deletions(-)

diff --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index 36d611750ca48..72c79dab6bdcc 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -899,6 +899,8 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat 
Number) {
   case APFloat::S_Float8E4M3FNUZ:
   case APFloat::S_Float8E4M3B11FNUZ:
   case APFloat::S_FloatTF32:
+  case APFloat::S_Float6E3M2FN:
+  case APFloat::S_Float6E2M3FN:
 llvm_unreachable("Tried to mangle unexpected APFloat semantics");
   }
 
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 44a301ecc9928..35d6d88013e9e 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -189,6 +189,14 @@ struct APFloatBase {
 // improved range compared to half (16-bit) formats, at (potentially)
 // greater throughput than single precision (32-bit) formats.
 S_FloatTF32,
+// 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E3M2FN,
+// 6-bit floating point number with bit layout S1E2M3. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E2M3FN,
 
 S_x87DoubleExtended,
 S_MaxSemantics = S_x87DoubleExtended,
@@ -209,6 +217,8 @@ struct APFloatBase {
   static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
   static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
   static const fltSemantics &FloatTF32() LLVM_READNONE;
+  static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
+  static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -627,6 +637,8 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
   APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
   APInt convertFloatTF32APFloatToAPInt() const;
+  APInt convertFloat6E3M2FNAPFloatToAPInt() const;
+  APInt convertFloat6E2M3FNAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   template  void initFromIEEEAPInt(const APInt &api);
   void initFromHalfAPInt(const APInt &api);
@@ -642,6 +654,8 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloat8E4M3FNUZAPInt(const APInt &api);
   void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
   void initFromFloatTF32APInt(const APInt &api);
+  void initFromFloat6E3M2FNAPInt(const APInt &api);
+  void initFromFloat6E2M3FNAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);
@@ -1039,6 +1053,17 @@ class APFloat : public APFloatBase {
   /// \param Semantics - type float semantics
   static APFloat getAllOnesValue(const fltSemantics &Semantics);
 
+  static bool hasNanOrInf(const fltSemantics &Sem) {
+switch (SemanticsToEnum(Sem)) {
+default:
+  return true;
+// Below Semantics do not support {NaN or Inf}
+case APFloat::S_Float6E3M2FN:
+case APFloat::S_Float6E2M3FN:
+  return false;
+}
+  }
+
   /// Used to insert APFloat objects, or objects that contain APFloat objects,
   /// into FoldingSets.
   void Profile(FoldingSetNodeID &NID) const;
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 283fcc153b33a..04d54ee66309c 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types.
+  // There is no representation for Inf or NaN.
+  NoNanInf,
 };
 
 // How NaN values are represented. Thi

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/94735

>From 94b25ae304a102cc8c0196f3ca6c460dd4de7026 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 5 Jun 2024 19:22:31 +0530
Subject: [PATCH] [APFloat] Add APFloat support for FP6 data types

This patch adds APFloat type support for two FP6
data types, E2M3 and E3M2. The definitions for the
two formats are detailed in section 5.3.2 of the
OCP specification, which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

Signed-off-by: Durgadoss R 
---
 clang/lib/AST/MicrosoftMangle.cpp  |   2 +
 llvm/include/llvm/ADT/APFloat.h|  25 ++
 llvm/lib/Support/APFloat.cpp   |  87 +-
 llvm/unittests/ADT/APFloatTest.cpp | 484 +++--
 4 files changed, 563 insertions(+), 35 deletions(-)

diff --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index 36d611750ca48..72c79dab6bdcc 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -899,6 +899,8 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat 
Number) {
   case APFloat::S_Float8E4M3FNUZ:
   case APFloat::S_Float8E4M3B11FNUZ:
   case APFloat::S_FloatTF32:
+  case APFloat::S_Float6E3M2FN:
+  case APFloat::S_Float6E2M3FN:
 llvm_unreachable("Tried to mangle unexpected APFloat semantics");
   }
 
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 44a301ecc9928..35d6d88013e9e 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -189,6 +189,14 @@ struct APFloatBase {
 // improved range compared to half (16-bit) formats, at (potentially)
 // greater throughput than single precision (32-bit) formats.
 S_FloatTF32,
+// 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E3M2FN,
+// 6-bit floating point number with bit layout S1E2M3. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E2M3FN,
 
 S_x87DoubleExtended,
 S_MaxSemantics = S_x87DoubleExtended,
@@ -209,6 +217,8 @@ struct APFloatBase {
   static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
   static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
   static const fltSemantics &FloatTF32() LLVM_READNONE;
+  static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
+  static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -627,6 +637,8 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
   APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
   APInt convertFloatTF32APFloatToAPInt() const;
+  APInt convertFloat6E3M2FNAPFloatToAPInt() const;
+  APInt convertFloat6E2M3FNAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   template  void initFromIEEEAPInt(const APInt &api);
   void initFromHalfAPInt(const APInt &api);
@@ -642,6 +654,8 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloat8E4M3FNUZAPInt(const APInt &api);
   void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
   void initFromFloatTF32APInt(const APInt &api);
+  void initFromFloat6E3M2FNAPInt(const APInt &api);
+  void initFromFloat6E2M3FNAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);
@@ -1039,6 +1053,17 @@ class APFloat : public APFloatBase {
   /// \param Semantics - type float semantics
   static APFloat getAllOnesValue(const fltSemantics &Semantics);
 
+  static bool hasNanOrInf(const fltSemantics &Sem) {
+switch (SemanticsToEnum(Sem)) {
+default:
+  return true;
+// Below Semantics do not support {NaN or Inf}
+case APFloat::S_Float6E3M2FN:
+case APFloat::S_Float6E2M3FN:
+  return false;
+}
+  }
+
   /// Used to insert APFloat objects, or objects that contain APFloat objects,
   /// into FoldingSets.
   void Profile(FoldingSetNodeID &NID) const;
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 283fcc153b33a..7525077b3fdc3 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types,
+  // which do not support Inf or NaN values.
+  NoNanInf,
 };
 
 // How NaN values are represented. This i

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits



@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types.
+  // There is no representation for Inf or NaN.
+  NoNanInf,

durga4github wrote:

Updated the comment in the latest revision.

For the naming:
The MX spec has a few other types but they support either Inf or Nan. So, I 
could not name it generically like "MXType".

I see that "inverting" (and naming) would change the condition-checks in the 
code.
Looking at the places where this is being used (in OR with other conditions), 
it gives better readability if we do not invert.

So, below are a few options that I could think of:
1) Finite
2) FiniteOnly
3) NoNanInf (current)
4) NoNanNoInf
5) SupportsNonFinite (if we strongly prefer inverted conditions and this naming)

Please let me know which one looks more reasonable.

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Durgadoss R via cfe-commits



@@ -139,6 +143,10 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = {
 static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
 4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
 static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
+static constexpr fltSemantics semFloat6E3M2FN = {

durga4github wrote:

Yes, I tried a few other suffixes like "FNOnly", "FNNoNaN" etc. They looked 
weird and lengthy. So, I stayed with the "FN" suffix.

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-08 Thread Durgadoss R via cfe-commits



@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types.
+  // There is no representation for Inf or NaN.
+  NoNanInf,

durga4github wrote:

Thank you, I will refresh with this "FiniteOnly".

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-08 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/94735

>From 2ee13938a4428948ae6fdeb82de6e0c15e2dd9f8 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 5 Jun 2024 19:22:31 +0530
Subject: [PATCH] [APFloat] Add APFloat support for FP6 data types

This patch adds APFloat type support for two FP6
data types, E2M3 and E3M2. The definitions for the
two formats are detailed in section 5.3.2 of the
OCP specification, which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

Signed-off-by: Durgadoss R 
---
 clang/lib/AST/MicrosoftMangle.cpp  |   2 +
 llvm/include/llvm/ADT/APFloat.h|  25 ++
 llvm/lib/Support/APFloat.cpp   |  87 +-
 llvm/unittests/ADT/APFloatTest.cpp | 484 +++--
 4 files changed, 563 insertions(+), 35 deletions(-)

diff --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index 36d611750ca48..72c79dab6bdcc 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -899,6 +899,8 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat 
Number) {
   case APFloat::S_Float8E4M3FNUZ:
   case APFloat::S_Float8E4M3B11FNUZ:
   case APFloat::S_FloatTF32:
+  case APFloat::S_Float6E3M2FN:
+  case APFloat::S_Float6E2M3FN:
 llvm_unreachable("Tried to mangle unexpected APFloat semantics");
   }
 
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 44a301ecc9928..35d6d88013e9e 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -189,6 +189,14 @@ struct APFloatBase {
 // improved range compared to half (16-bit) formats, at (potentially)
 // greater throughput than single precision (32-bit) formats.
 S_FloatTF32,
+// 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E3M2FN,
+// 6-bit floating point number with bit layout S1E2M3. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float6E2M3FN,
 
 S_x87DoubleExtended,
 S_MaxSemantics = S_x87DoubleExtended,
@@ -209,6 +217,8 @@ struct APFloatBase {
   static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
   static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
   static const fltSemantics &FloatTF32() LLVM_READNONE;
+  static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
+  static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -627,6 +637,8 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
   APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
   APInt convertFloatTF32APFloatToAPInt() const;
+  APInt convertFloat6E3M2FNAPFloatToAPInt() const;
+  APInt convertFloat6E2M3FNAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   template  void initFromIEEEAPInt(const APInt &api);
   void initFromHalfAPInt(const APInt &api);
@@ -642,6 +654,8 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloat8E4M3FNUZAPInt(const APInt &api);
   void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
   void initFromFloatTF32APInt(const APInt &api);
+  void initFromFloat6E3M2FNAPInt(const APInt &api);
+  void initFromFloat6E2M3FNAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);
@@ -1039,6 +1053,17 @@ class APFloat : public APFloatBase {
   /// \param Semantics - type float semantics
   static APFloat getAllOnesValue(const fltSemantics &Semantics);
 
+  static bool hasNanOrInf(const fltSemantics &Sem) {
+switch (SemanticsToEnum(Sem)) {
+default:
+  return true;
+// Below Semantics do not support {NaN or Inf}
+case APFloat::S_Float6E3M2FN:
+case APFloat::S_Float6E2M3FN:
+  return false;
+}
+  }
+
   /// Used to insert APFloat objects, or objects that contain APFloat objects,
   /// into FoldingSets.
   void Profile(FoldingSetNodeID &NID) const;
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 283fcc153b33a..1209bf71a287d 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types,
+  // which do not support Inf or NaN values.
+  FiniteOnly,
 };
 
 // How NaN values are represented. This

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-08 Thread Durgadoss R via cfe-commits



@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types.
+  // There is no representation for Inf or NaN.
+  NoNanInf,

durga4github wrote:

@kuhar , I updated the naming to "FiniteOnly" in the latest revision. Please 
let me know if it's good to go.


https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-11 Thread Durgadoss R via cfe-commits


durga4github wrote:

There is one test failure in Codegen/LoongArch/opt-pipeline.ll and it does not 
seem related to my changes here.
So, merging this change.

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-11 Thread Durgadoss R via cfe-commits


https://github.com/durga4github closed 
https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP4 data type (PR #95392)

2024-06-13 Thread Durgadoss R via cfe-commits


https://github.com/durga4github created 
https://github.com/llvm/llvm-project/pull/95392

This patch adds APFloat type support for the E2M1
FP4 datatype. The definitions for this format are
detailed in section 5.3.3 of the OCP specification,
which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

>From db104d4e7479c704a153ff8cbad7c8568e3ffaf3 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 12 Jun 2024 23:55:04 +0530
Subject: [PATCH] [APFloat] Add APFloat support for FP4 data type

This patch adds APFloat type support for the E2M1
FP4 datatype. The definitions for this format are
detailed in section 5.3.3 of the OCP specification,
which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

Signed-off-by: Durgadoss R 
---
 clang/lib/AST/MicrosoftMangle.cpp  |   1 +
 llvm/include/llvm/ADT/APFloat.h|   8 +
 llvm/lib/Support/APFloat.cpp   |  25 ++-
 llvm/unittests/ADT/APFloatTest.cpp | 256 -
 4 files changed, 286 insertions(+), 4 deletions(-)

diff --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index ffc5d2d4cd8fc..a863ec7a529b9 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -901,6 +901,7 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat 
Number) {
   case APFloat::S_FloatTF32:
   case APFloat::S_Float6E3M2FN:
   case APFloat::S_Float6E2M3FN:
+  case APFloat::S_Float4E2M1FN:
 llvm_unreachable("Tried to mangle unexpected APFloat semantics");
   }
 
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index a9bb6ccb1..c24eae8da3797 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -197,6 +197,10 @@ struct APFloatBase {
 // types, there are no infinity or NaN values. The format is detailed in
 // 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
 S_Float6E2M3FN,
+// 4-bit floating point number with bit layout S1E2M1. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float4E2M1FN,
 
 S_x87DoubleExtended,
 S_MaxSemantics = S_x87DoubleExtended,
@@ -219,6 +223,7 @@ struct APFloatBase {
   static const fltSemantics &FloatTF32() LLVM_READNONE;
   static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
   static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
+  static const fltSemantics &Float4E2M1FN() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -639,6 +644,7 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloatTF32APFloatToAPInt() const;
   APInt convertFloat6E3M2FNAPFloatToAPInt() const;
   APInt convertFloat6E2M3FNAPFloatToAPInt() const;
+  APInt convertFloat4E2M1FNAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   template  void initFromIEEEAPInt(const APInt &api);
   void initFromHalfAPInt(const APInt &api);
@@ -656,6 +662,7 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloatTF32APInt(const APInt &api);
   void initFromFloat6E3M2FNAPInt(const APInt &api);
   void initFromFloat6E2M3FNAPInt(const APInt &api);
+  void initFromFloat4E2M1FNAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);
@@ -1067,6 +1074,7 @@ class APFloat : public APFloatBase {
 // Below Semantics do not support {NaN or Inf}
 case APFloat::S_Float6E3M2FN:
 case APFloat::S_Float6E2M3FN:
+case APFloat::S_Float4E2M1FN:
   return false;
 }
   }
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 1209bf71a287d..fab3052a9c02e 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -69,8 +69,8 @@ enum class fltNonfiniteBehavior {
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
 
-  // This behavior is present in Float6E3M2FN and Float6E2M3FN types,
-  // which do not support Inf or NaN values.
+  // This behavior is present in Float6E3M2FN, Float6E2M3FN and
+  // Float4E2M1FN types, which do not support Inf or NaN values.
   FiniteOnly,
 };
 
@@ -147,6 +147,8 @@ static constexpr fltSemantics semFloat6E3M2FN = {
 4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
 static constexpr fltSemantics semFloat6E2M3FN = {
 2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly};
+static constexpr fltSemantics semFloat4E2M1FN = {
+2, 0, 2, 4, fltNonfiniteBehavior::FiniteOnly};
 static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
 static constexpr fltSemantics semBogus = {0, 0, 0, 0};
 
@@ -218,6 +220,8 @@ const llvm::fltSemantics 
&APFloatBase::EnumToSemantics(Semantics S) {

[clang] [llvm] [APFloat] Add APFloat support for FP4 data type (PR #95392)

2024-06-13 Thread Durgadoss R via cfe-commits


https://github.com/durga4github edited 
https://github.com/llvm/llvm-project/pull/95392
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP4 data type (PR #95392)

2024-06-13 Thread Durgadoss R via cfe-commits


durga4github wrote:

@ThomasRaoux, Could you please help review this change?

https://github.com/llvm/llvm-project/pull/95392
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP4 data type (PR #95392)

2024-06-13 Thread Durgadoss R via cfe-commits



@@ -69,8 +69,8 @@ enum class fltNonfiniteBehavior {
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
 
-  // This behavior is present in Float6E3M2FN and Float6E2M3FN types,
-  // which do not support Inf or NaN values.
+  // This behavior is present in Float6E3M2FN, Float6E2M3FN and
+  // Float4E2M1FN types, which do not support Inf or NaN values.

durga4github wrote:

Fixed in the latest revision :-)

https://github.com/llvm/llvm-project/pull/95392
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP4 data type (PR #95392)

2024-06-13 Thread Durgadoss R via cfe-commits



@@ -6907,6 +7028,42 @@ TEST(APFloatTest, ConvertE2M3FToE3M2F) {
   EXPECT_EQ(status, APFloat::opInexact);
 }
 
+TEST(APFloatTest, ConvertDoubleToE2M1F) {
+  bool losesInfo;

durga4github wrote:

Updated this also to be consistent.

https://github.com/llvm/llvm-project/pull/95392
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [APFloat] Add APFloat support for FP4 data type (PR #95392)

2024-06-13 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/95392

>From af17388ffd5096a0c50b62dbd8073f957c052bb1 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 12 Jun 2024 23:55:04 +0530
Subject: [PATCH] [APFloat] Add APFloat support for FP4 data type

This patch adds APFloat type support for the E2M1
FP4 datatype. The definitions for this format are
detailed in section 5.3.3 of the OCP specification,
which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

Signed-off-by: Durgadoss R 
---
 clang/lib/AST/MicrosoftMangle.cpp  |   1 +
 llvm/include/llvm/ADT/APFloat.h|   8 +
 llvm/lib/Support/APFloat.cpp   |  25 ++-
 llvm/unittests/ADT/APFloatTest.cpp | 256 -
 4 files changed, 283 insertions(+), 7 deletions(-)

diff --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index ffc5d2d4cd8fc..a863ec7a529b9 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -901,6 +901,7 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat 
Number) {
   case APFloat::S_FloatTF32:
   case APFloat::S_Float6E3M2FN:
   case APFloat::S_Float6E2M3FN:
+  case APFloat::S_Float4E2M1FN:
 llvm_unreachable("Tried to mangle unexpected APFloat semantics");
   }
 
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index a9bb6ccb1..c24eae8da3797 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -197,6 +197,10 @@ struct APFloatBase {
 // types, there are no infinity or NaN values. The format is detailed in
 // 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
 S_Float6E2M3FN,
+// 4-bit floating point number with bit layout S1E2M1. Unlike IEEE-754
+// types, there are no infinity or NaN values. The format is detailed in
+// 
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+S_Float4E2M1FN,
 
 S_x87DoubleExtended,
 S_MaxSemantics = S_x87DoubleExtended,
@@ -219,6 +223,7 @@ struct APFloatBase {
   static const fltSemantics &FloatTF32() LLVM_READNONE;
   static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
   static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
+  static const fltSemantics &Float4E2M1FN() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -639,6 +644,7 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloatTF32APFloatToAPInt() const;
   APInt convertFloat6E3M2FNAPFloatToAPInt() const;
   APInt convertFloat6E2M3FNAPFloatToAPInt() const;
+  APInt convertFloat4E2M1FNAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   template  void initFromIEEEAPInt(const APInt &api);
   void initFromHalfAPInt(const APInt &api);
@@ -656,6 +662,7 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloatTF32APInt(const APInt &api);
   void initFromFloat6E3M2FNAPInt(const APInt &api);
   void initFromFloat6E2M3FNAPInt(const APInt &api);
+  void initFromFloat4E2M1FNAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);
@@ -1067,6 +1074,7 @@ class APFloat : public APFloatBase {
 // Below Semantics do not support {NaN or Inf}
 case APFloat::S_Float6E3M2FN:
 case APFloat::S_Float6E2M3FN:
+case APFloat::S_Float4E2M1FN:
   return false;
 }
   }
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 1209bf71a287d..47618bc325951 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -69,8 +69,8 @@ enum class fltNonfiniteBehavior {
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
 
-  // This behavior is present in Float6E3M2FN and Float6E2M3FN types,
-  // which do not support Inf or NaN values.
+  // This behavior is present in Float6E3M2FN, Float6E2M3FN, and
+  // Float4E2M1FN types, which do not support Inf or NaN values.
   FiniteOnly,
 };
 
@@ -147,6 +147,8 @@ static constexpr fltSemantics semFloat6E3M2FN = {
 4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
 static constexpr fltSemantics semFloat6E2M3FN = {
 2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly};
+static constexpr fltSemantics semFloat4E2M1FN = {
+2, 0, 2, 4, fltNonfiniteBehavior::FiniteOnly};
 static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
 static constexpr fltSemantics semBogus = {0, 0, 0, 0};
 
@@ -218,6 +220,8 @@ const llvm::fltSemantics 
&APFloatBase::EnumToSemantics(Semantics S) {
 return Float6E3M2FN();
   case S_Float6E2M3FN:
 return Float6E2M3FN();
+  case S_Float4E2M1FN:
+return Float4E2M1FN();
   case S_x87DoubleExtended:
 return x87DoubleExtended();
   }
@@ -254,6 +258,8 @@ APFloatBase::SemanticsToEnum(const llvm::fltSeman

[clang] [llvm] [APFloat] Add APFloat support for FP4 data type (PR #95392)

2024-06-14 Thread Durgadoss R via cfe-commits


https://github.com/durga4github closed 
https://github.com/llvm/llvm-project/pull/95392
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [llvm][NFC] `APFloat`: Add missing semantics to enum (PR #117291)

2024-11-22 Thread Durgadoss R via cfe-commits


durga4github wrote:

Hi @matthias-springer ,

Can we split this into at least two separate PRs?

One for the first two items in the commit message.
And one (or two) PRs for the rest of the changes.


https://github.com/llvm/llvm-project/pull/117291
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add tcgen05 alloc/dealloc intrinsics (PR #124961)

2025-02-04 Thread Durgadoss R via cfe-commits


https://github.com/durga4github closed 
https://github.com/llvm/llvm-project/pull/124961
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add tcgen05 alloc/dealloc intrinsics (PR #124961)

2025-01-31 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/124961

>From 467c3a41badb66b9187864a040c9eeccef1b583c Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 29 Jan 2025 16:31:06 +0530
Subject: [PATCH] [NVPTX] Add tcgen05 alloc/dealloc intrinsics

This patch adds intrinsics for the tcgen05
alloc/dealloc family of PTX instructions.

This patch also adds addrspace 6 for tensor memory
which is used by these intrinsics.

lit tests are added and verified with a ptxas-12.8
executable.

Documentation for these additions is also added in
NVPTXUsage.rst.

Signed-off-by: Durgadoss R 
---
 clang/lib/Basic/Targets/NVPTX.cpp|   9 +-
 clang/test/CodeGen/target-data.c |   4 +-
 llvm/docs/NVPTXUsage.rst |  98 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td   |  30 +
 llvm/include/llvm/Support/NVPTXAddrSpace.h   |   1 +
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  |   1 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td |  41 ++
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h   |  15 +++
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp |   3 +
 llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll | 131 +++
 10 files changed, 327 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll

diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index a03f4983b9d0384..017146a9ada14a3 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -62,12 +62,13 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
   HasFloat16 = true;
 
   if (TargetPointerWidth == 32)
-resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
-  else if (Opts.NVPTXUseShortPointers)
 resetDataLayout(
-
"e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+"e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+  else if (Opts.NVPTXUseShortPointers)
+
resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:"
+"16-v32:32-n16:32:64");
   else
-resetDataLayout("e-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
 
   // If possible, get a TargetInfo for our host triple, so we can match its
   // types.
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index 71eb849433ed40d..fe29aadb1dd532f 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,11 +160,11 @@
 
 // RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=NVPTX
-// NVPTX: target datalayout = 
"e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+// NVPTX: target datalayout = 
"e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 
 // RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=NVPTX64
-// NVPTX64: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+// NVPTX64: target datalayout = 
"e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 
 // RUN: %clang_cc1 -triple r600-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=R600
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 64dd2b84a1763e7..dec6ad4e541152a 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -962,6 +962,104 @@ The ``griddepcontrol`` intrinsics allows the dependent 
grids and prerequisite gr
 For more information, refer 
 `PTX ISA 
`__.
 
+TCGEN05 family of Intrinsics
+
+
+The llvm.nvvm.tcgen05.* intrinsics model the TCGEN05 family of instructions
+exposed by PTX. These intrinsics use 'Tensor Memory' (henceforth ``tmem``).
+NVPTX represents this memory using ``addrspace(6)`` and is always 32-bits.
+
+For more information, refer to the PTX ISA
+``_.
+
+The tensor-memory pointers may only be used with the tcgen05 intrinsics.
+There are specialized load/store instructions provided (tcgen05.ld/st) to
+work with tensor-memory.
+
+See the PTX ISA for more information on tensor-memory load/store instructions
+``_.
+
+'``llvm.nvvm.tcgen05.alloc``'
+^
+
+Syntax:
+"""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %dst, i32 %ncols)
+  declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %dst, i32 %ncols)
+  declare void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %dst, i32 
%ncols)
+  declare void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %dst, i32 
%ncols)
+
+Overview:
+"
+
+The '``@llvm.nvvm.tcgen05.alloc.*``' intrins

[clang] [llvm] [NVPTX] Add tcgen05 alloc/dealloc intrinsics (PR #124961)

2025-01-31 Thread Durgadoss R via cfe-commits



@@ -962,6 +962,109 @@ The ``griddepcontrol`` intrinsics allows the dependent 
grids and prerequisite gr
 For more information, refer 
 `PTX ISA 
`__.
 
+TCGEN05 family of Intrinsics
+
+
+The llvm.nvvm.tcgen05.* intrinsics model the TCGEN05 family of instructions
+exposed by PTX. These intrinsics use 'Tensor Memory' (henceforth ``tmem``).
+NVPTX represents this memory using ``addrspace(6)`` and is always 32-bits.
+
+For more information, refer PTX ISA
+``_.
+
+The tensor-memory pointers may only be used with the tcgen05 intrinsics.
+There are specialized load/store instructions provided (tcgen05.ld/st) to
+work with tensor-memory.
+
+For more information on tensor-memory load/store instructions, refer
+``_.
+
+All tcgen05 intrinsics use a ``null`` pointer in tmem address
+space as their last operand. This helps to preserve ordering among the tcgen05
+operations especially when the intrinsic lacks any tmem operands. This
+last operand is dropped during Codegen.

durga4github wrote:

> After reading PTX docs here's my understanding of the situation.
> 
> * there's a new kind of memory, so creating a separate AS for tmem is 
> reasonable.
> * tcgen05.alloc returns allocation result indirectly, by storing it in a 
> shared memory. So LLVM has no direct indication that the intrinsic operates 
> on tmem and affects both shared memory and tmem
> * it's not clear from PTX docs what's the input for tcgen05.dealloc. It just 
> says "The operand taddr must point to a previous [Tensor 
> Memory](https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory)
>  allocation" but I can't tell if that means the previous location in the 
> shared memory where it stored a tmem pointer, or the tmem pointer itself. 
> Judging by the proposed intrinsic signature, it's the latter. In this case 
> LLVM knows that we're touching tmem.
> * relinquish_alloc_permit blocks subsequent allocations, so it must not be 
> reordered vs allocs.
> 
> So, the only odd thing is the allocation returning the result indirectly.
> 
> Proposed design adds artificial tmem pointer to let LLVM know that all 
> tcgen05 intrinsics operate on tmem and we can give LLVM sufficient hints on 
> how they should be ordered. However, the dummy argument is a crutch.
> 
> The gist of the problem here is that LLVM's existing intrinsic annotation is 
> not flexible enough to describe what we have here, exactly. I.e. there's no 
> way to tell LLVM that alloc and relinquish_alloc_permit operate on tmem. Our 
> current 

Yes, this is precisely what we meant to deal with.

options are to either make all intrinsics conservatively with `HasSideEffects` 
or, with a more relaxed "IntrInaccessibleMemOnly". I think the latter would be 
a reasonable trade-off for the time being.
> 

Sure. I have updated the alloc/relinq intrinsic to use InaccessibleMem* 
properties, in the latest revision.

> A longer-term approach would be to add a new intrinsic property allowing to 
> specify specific AS accessed by the intrinsic. E.g. we may extend existing 
> `IntrWriteMem` and `IntrWriteMem` to allow narrowing the scope to particular 
> AS, and allow specifying more than one. E.g. alloc would indicate that it 
> writes both shared and tmem. I think that would be a useful addition to a 
> handful of other intrinsics we already have, not just in NVPTX, but in the 
> other back-ends that need to deal with multiple AS.

Yes, cannot agree more on this..



https://github.com/llvm/llvm-project/pull/124961
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics for redux.sync f32 instructions (PR #126664)

2025-02-11 Thread Durgadoss R via cfe-commits


https://github.com/durga4github approved this pull request.

The latest revision looks good to me.

https://github.com/llvm/llvm-project/pull/126664
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics for redux.sync f32 instructions (PR #126664)

2025-02-11 Thread Durgadoss R via cfe-commits



@@ -328,6 +328,24 @@ defm REDUX_SYNC_AND : REDUX_SYNC<"and", "b32", 
int_nvvm_redux_sync_and>;
 defm REDUX_SYNC_XOR : REDUX_SYNC<"xor", "b32", int_nvvm_redux_sync_xor>;
 defm REDUX_SYNC_OR : REDUX_SYNC<"or", "b32", int_nvvm_redux_sync_or>;
 
+multiclass REDUX_SYNC_F {
+  def : NVPTXInst<(outs Float32Regs:$dst),
+  (ins Float32Regs:$src, Int32Regs:$mask),
+  "redux.sync." # !tolower(BinOp) # !subst("_", ".", ABS) # 
!subst("_", ".", NAN) # ".f32 $dst, $src, $mask;",

durga4github wrote:

we do not need tolower

https://github.com/llvm/llvm-project/pull/126664
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics for redux.sync f32 instructions (PR #126664)

2025-02-11 Thread Durgadoss R via cfe-commits



@@ -328,6 +328,24 @@ defm REDUX_SYNC_AND : REDUX_SYNC<"and", "b32", 
int_nvvm_redux_sync_and>;
 defm REDUX_SYNC_XOR : REDUX_SYNC<"xor", "b32", int_nvvm_redux_sync_xor>;
 defm REDUX_SYNC_OR : REDUX_SYNC<"or", "b32", int_nvvm_redux_sync_or>;
 
+multiclass REDUX_SYNC_F {
+  def : NVPTXInst<(outs Float32Regs:$dst),
+  (ins Float32Regs:$src, Int32Regs:$mask),
+  "redux.sync." # !tolower(BinOp) # !subst("_", ".", ABS) # 
!subst("_", ".", NAN) # ".f32 $dst, $src, $mask;",
+  [(set f32:$dst, (Intrin f32:$src, Int32Regs:$mask))]>,
+  Requires<[hasPTX<86>, hasSM100a]>; 
+ 
+}
+
+defm REDUX_SYNC_FMIN : REDUX_SYNC_F<"min", "", "", int_nvvm_redux_sync_fmin>;
+defm REDUX_SYNC_FMIN_ABS : REDUX_SYNC_F<"min", "_abs", "", 
int_nvvm_redux_sync_fmin_abs>;
+defm REDUX_SYNC_FMIN_NAN: REDUX_SYNC_F<"min", "", "_NaN", 
int_nvvm_redux_sync_fmin_NaN>;
+defm REDUX_SYNC_FMIN_ABS_NAN: REDUX_SYNC_F<"min", "_abs", "_NaN", 
int_nvvm_redux_sync_fmin_abs_NaN>;
+defm REDUX_SYNC_FMAX : REDUX_SYNC_F<"max", "", "", int_nvvm_redux_sync_fmax>;
+defm REDUX_SYNC_FMAX_ABS : REDUX_SYNC_F<"max", "_abs", "", 
int_nvvm_redux_sync_fmax_abs>;
+defm REDUX_SYNC_FMAX_NAN: REDUX_SYNC_F<"max", "", "_NaN", 
int_nvvm_redux_sync_fmax_NaN>;
+defm REDUX_SYNC_FMAX_ABS_NAN: REDUX_SYNC_F<"max", "_abs", "_NaN", 
int_nvvm_redux_sync_fmax_abs_NaN>;

durga4github wrote:

I believe, we could easily construct the intrinsic from a cast in the 
multiclass itself.

https://github.com/llvm/llvm-project/pull/126664
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics for redux.sync f32 instructions (PR #126664)

2025-02-11 Thread Durgadoss R via cfe-commits



@@ -1,11 +1,13 @@
-// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx70" 
"-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
-// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx70" 
"-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx86" 
"-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx86" 
"-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s

durga4github wrote:

Let us keep the existing file and the tests intact. We need them for 
ptx70/sm_80.

Can we add another redux-f32-builtins.cu file with only the new additions from 
this change?

https://github.com/llvm/llvm-project/pull/126664
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add tcgen05 alloc/dealloc intrinsics (PR #124961)

2025-01-30 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/124961

>From 632fc53beebac1d77d33c1f46893f2c868b35313 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 29 Jan 2025 16:31:06 +0530
Subject: [PATCH] [NVPTX] Add tcgen05 alloc/dealloc intrinsics

This patch adds intrinsics for the tcgen05
alloc/dealloc family of PTX instructions.

This patch also adds addrspace 6 for tensor memory
which is used by these intrinsics.

lit tests are added and verified with a ptxas-12.8
executable.

Documentation for these additions is also added in
NVPTXUsage.rst.

Signed-off-by: Durgadoss R 
---
 clang/lib/Basic/Targets/NVPTX.cpp|   9 +-
 clang/test/CodeGen/target-data.c |   4 +-
 llvm/docs/NVPTXUsage.rst | 103 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td   |  34 +
 llvm/include/llvm/Support/NVPTXAddrSpace.h   |   1 +
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  |   1 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td |  42 ++
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h   |  15 ++
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp |   3 +
 llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll | 139 +++
 10 files changed, 345 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll

diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index a03f4983b9d038..017146a9ada14a 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -62,12 +62,13 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
   HasFloat16 = true;
 
   if (TargetPointerWidth == 32)
-resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
-  else if (Opts.NVPTXUseShortPointers)
 resetDataLayout(
-
"e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+"e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+  else if (Opts.NVPTXUseShortPointers)
+
resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:"
+"16-v32:32-n16:32:64");
   else
-resetDataLayout("e-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
 
   // If possible, get a TargetInfo for our host triple, so we can match its
   // types.
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index 71eb849433ed40..fe29aadb1dd532 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,11 +160,11 @@
 
 // RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=NVPTX
-// NVPTX: target datalayout = 
"e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+// NVPTX: target datalayout = 
"e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 
 // RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=NVPTX64
-// NVPTX64: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+// NVPTX64: target datalayout = 
"e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 
 // RUN: %clang_cc1 -triple r600-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=R600
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 64dd2b84a1763e..083d85bbedae5b 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -962,6 +962,109 @@ The ``griddepcontrol`` intrinsics allows the dependent 
grids and prerequisite gr
 For more information, refer 
 `PTX ISA 
`__.
 
+TCGEN05 family of Intrinsics
+
+
+The llvm.nvvm.tcgen05.* intrinsics model the TCGEN05 family of instructions
+exposed by PTX. These intrinsics use 'Tensor Memory' (henceforth ``tmem``).
+NVPTX represents this memory using ``addrspace(6)`` and is always 32-bits.
+
+For more information, refer to the PTX ISA
+``_.
+
+The tensor-memory pointers may only be used with the tcgen05 intrinsics.
+There are specialized load/store instructions provided (tcgen05.ld/st) to
+work with tensor-memory.
+
+See the PTX ISA for more information on tensor-memory load/store instructions
+``_.
+
+All tcgen05 intrinsics use a ``null`` pointer in tmem address
+space as their last operand. This helps to preserve ordering among the tcgen05
+operations especially when the intrinsic lacks any tmem operands. This
+last operand is dropped during Codegen.
+
+'``llvm.nvvm.tcgen05.alloc``'
+^
+
+Syntax:
+"""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %dst, i32 %ncols, ptr 
addrspace(6) null)
+  declare void @llvm.nvvm.tcgen05.allo

[clang] [llvm] [NVPTX] Add tcgen05 alloc/dealloc intrinsics (PR #124961)

2025-01-30 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/124961

>From bfe728f879b5a20be2269c6d9e52c1feb0cce64b Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Wed, 29 Jan 2025 16:31:06 +0530
Subject: [PATCH] [NVPTX] Add tcgen05 alloc/dealloc intrinsics

This patch adds intrinsics for the tcgen05
alloc/dealloc family of PTX instructions.

This patch also adds addrspace 6 for tensor memory
which is used by these intrinsics.

lit tests are added and verified with a ptxas-12.8
executable.

Documentation for these additions is also added in
NVPTXUsage.rst.

Signed-off-by: Durgadoss R 
---
 clang/lib/Basic/Targets/NVPTX.cpp|   6 +-
 clang/test/CodeGen/target-data.c |   4 +-
 llvm/docs/NVPTXUsage.rst | 103 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td   |  34 +
 llvm/include/llvm/Support/NVPTXAddrSpace.h   |   1 +
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  |   1 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td |  42 ++
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h   |  15 ++
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp |   3 +
 llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll | 139 +++
 10 files changed, 343 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll

diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index a03f4983b9d038..168c28114b3053 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -62,12 +62,12 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
   HasFloat16 = true;
 
   if (TargetPointerWidth == 32)
-resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+
resetDataLayout("e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
   else if (Opts.NVPTXUseShortPointers)
 resetDataLayout(
-
"e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+
"e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
   else
-resetDataLayout("e-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
 
   // If possible, get a TargetInfo for our host triple, so we can match its
   // types.
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index 71eb849433ed40..fe29aadb1dd532 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,11 +160,11 @@
 
 // RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=NVPTX
-// NVPTX: target datalayout = 
"e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+// NVPTX: target datalayout = 
"e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 
 // RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=NVPTX64
-// NVPTX64: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+// NVPTX64: target datalayout = 
"e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 
 // RUN: %clang_cc1 -triple r600-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=R600
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 64dd2b84a1763e..083d85bbedae5b 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -962,6 +962,109 @@ The ``griddepcontrol`` intrinsics allows the dependent 
grids and prerequisite gr
 For more information, refer 
 `PTX ISA 
`__.
 
+TCGEN05 family of Intrinsics
+
+
+The llvm.nvvm.tcgen05.* intrinsics model the TCGEN05 family of instructions
+exposed by PTX. These intrinsics use 'Tensor Memory' (henceforth ``tmem``).
+NVPTX represents this memory using ``addrspace(6)`` and is always 32-bits.
+
+For more information, refer to the PTX ISA
+``_.
+
+The tensor-memory pointers may only be used with the tcgen05 intrinsics.
+There are specialized load/store instructions provided (tcgen05.ld/st) to
+work with tensor-memory.
+
+See the PTX ISA for more information on tensor-memory load/store instructions
+``_.
+
+All tcgen05 intrinsics use a ``null`` pointer in tmem address
+space as their last operand. This helps to preserve ordering among the tcgen05
+operations especially when the intrinsic lacks any tmem operands. This
+last operand is dropped during Codegen.
+
+'``llvm.nvvm.tcgen05.alloc``'
+^
+
+Syntax:
+"""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %dst, i32 %ncols, ptr 
addrspace(6) null)
+  declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %dst, i32 %ncols, ptr 
addrspace(6) null)
+  declare

[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-19 Thread Durgadoss R via cfe-commits



@@ -682,6 +688,9 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const 
llvm::Triple &Triple,
   case CudaVersion::CUDA_##CUDA_VER:   
\
 PtxFeature = "+ptx" #PTX_VER;  
\
 break;
+CASE_CUDA_VERSION(129, 86);
+CASE_CUDA_VERSION(128, 86);
+CASE_CUDA_VERSION(127, 85);

durga4github wrote:

On rechecking this, we have 12.7 mapping to PTX 86 (and 12.8 to PTX 87).
Could you please check this once?

https://github.com/llvm/llvm-project/pull/123398
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] Remove incorrect CUDA defines (PR #123898)

2025-01-22 Thread Durgadoss R via cfe-commits


https://github.com/durga4github approved this pull request.

Changes look good to me. Let us wait for Artem's review

https://github.com/llvm/llvm-project/pull/123898
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-17 Thread Durgadoss R via cfe-commits


https://github.com/durga4github approved this pull request.

The updates look good to me.

https://github.com/llvm/llvm-project/pull/123398
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [StrTable] Mechanically convert NVPTX builtins to use TableGen (PR #122873)

2025-01-27 Thread Durgadoss R via cfe-commits


durga4github wrote:

LGTM overall. I work with these builtins only occasionally. So, let us wait for 
Artem's review.

https://github.com/llvm/llvm-project/pull/122873
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [StrTable] Mechanically convert NVPTX builtins to use TableGen (PR #122873)

2025-01-27 Thread Durgadoss R via cfe-commits


durga4github wrote:

> Ping!
> 
> I've updated this to incorporate the changes in #123398 to the NVPTX.def file 
> this is replacing.
> 
Thanks for this!





https://github.com/llvm/llvm-project/pull/122873
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [StrTable] Mechanically convert NVPTX builtins to use TableGen (PR #122873)

2025-01-27 Thread Durgadoss R via cfe-commits


https://github.com/durga4github edited 
https://github.com/llvm/llvm-project/pull/122873
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [StrTable] Mechanically convert NVPTX builtins to use TableGen (PR #122873)

2025-01-27 Thread Durgadoss R via cfe-commits



@@ -0,0 +1,1078 @@
+//===--- BuiltinsNVPTX.td - NVPTX Builtin function defs -*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This file defines the PTX-specific builtin function database.
+//
+//===--===//
+
+include "clang/Basic/BuiltinsBase.td"
+
+class SMFeatures {
+  string Features;
+}
+
+class SM newer_list> : SMFeatures {
+  let Features = !foldl(!strconcat("sm_", version), newer_list, f, newer,
+!strconcat(f, "|", newer.Features));
+}
+
+let Features = "sm_100a" in def SM_100a : SMFeatures;
+
+def SM_100 : SM<"100", [SM_100a]>;
+
+let Features = "sm_90a" in def SM_90a : SMFeatures;
+
+def SM_90 : SM<"90", [SM_90a, SM_100]>;
+def SM_89 : SM<"89", [SM_90]>;
+def SM_87 : SM<"87", [SM_89]>;
+def SM_86 : SM<"86", [SM_87]>;
+def SM_80 : SM<"80", [SM_86]>;
+def SM_75 : SM<"75", [SM_80]>;
+def SM_72 : SM<"72", [SM_75]>;
+def SM_70 : SM<"70", [SM_72]>;
+def SM_62 : SM<"62", [SM_70]>;
+def SM_61 : SM<"61", [SM_62]>;
+def SM_60 : SM<"60", [SM_61]>;
+def SM_53 : SM<"53", [SM_60]>;
+
+class PTXFeatures {
+  string Features;
+}
+
+class PTX : PTXFeatures {
+  let Features = !strconcat("ptx", version, "|", newer.Features);
+}
+
+let Features = "ptx86" in def PTX86 : PTXFeatures;
+
+def PTX85 : PTX<"85", PTX86>;
+def PTX84 : PTX<"84", PTX85>;
+def PTX83 : PTX<"83", PTX84>;
+def PTX82 : PTX<"82", PTX83>;
+def PTX81 : PTX<"81", PTX82>;
+def PTX80 : PTX<"80", PTX81>;
+def PTX78 : PTX<"78", PTX80>;
+def PTX77 : PTX<"77", PTX78>;
+def PTX76 : PTX<"76", PTX77>;
+def PTX75 : PTX<"75", PTX76>;
+def PTX74 : PTX<"74", PTX75>;
+def PTX73 : PTX<"73", PTX74>;
+def PTX72 : PTX<"72", PTX73>;
+def PTX71 : PTX<"71", PTX72>;
+def PTX70 : PTX<"70", PTX71>;
+def PTX65 : PTX<"65", PTX70>;
+def PTX64 : PTX<"64", PTX65>;
+def PTX63 : PTX<"63", PTX64>;
+def PTX62 : PTX<"62", PTX63>;
+def PTX61 : PTX<"61", PTX62>;
+def PTX60 : PTX<"60", PTX61>;
+def PTX42 : PTX<"42", PTX60>;
+
+class NVPTXBuiltin : TargetBuiltin {
+  let Spellings = [NAME];
+  let Prototype = prototype;
+}
+
+class NVPTXBuiltinSM : 
NVPTXBuiltin {
+  let Features = sm.Features;
+}
+
+class NVPTXBuiltinPTX : 
NVPTXBuiltin {
+  let Features = ptx.Features;
+}
+
+class NVPTXBuiltinSMAndPTX : 
NVPTXBuiltin {
+  let Features = !strconcat("(", sm.Features, "),(", ptx.Features, ")");
+}
+
+// Special Registers
+
+let Attributes = [NoThrow, Const] in {
+  def __nvvm_read_ptx_sreg_tid_x : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_tid_y : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_tid_z : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_tid_w : NVPTXBuiltin<"int()">;
+
+  def __nvvm_read_ptx_sreg_ntid_x : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_ntid_y : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_ntid_z : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_ntid_w : NVPTXBuiltin<"int()">;
+
+  def __nvvm_read_ptx_sreg_ctaid_x : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_ctaid_y : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_ctaid_z : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_ctaid_w : NVPTXBuiltin<"int()">;
+
+  def __nvvm_read_ptx_sreg_nctaid_x : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_nctaid_y : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_nctaid_z : NVPTXBuiltin<"int()">;
+  def __nvvm_read_ptx_sreg_nctaid_w : NVPTXBuiltin<"int()">;
+
+  def __nvvm_read_ptx_sreg_clusterid_x : NVPTXBuiltinSMAndPTX<"int()", SM_90, 
PTX78>;
+  def __nvvm_read_ptx_sreg_clusterid_y : NVPTXBuiltinSMAndPTX<"int()", SM_90, 
PTX78>;
+  def __nvvm_read_ptx_sreg_clusterid_z : NVPTXBuiltinSMAndPTX<"int()", SM_90, 
PTX78>;
+  def __nvvm_read_ptx_sreg_clusterid_w : NVPTXBuiltinSMAndPTX<"int()", SM_90, 
PTX78>;
+
+  def __nvvm_read_ptx_sreg_nclusterid_x : NVPTXBuiltinSMAndPTX<"int()", SM_90, 
PTX78>;
+  def __nvvm_read_ptx_sreg_nclusterid_y : NVPTXBuiltinSMAndPTX<"int()", SM_90, 
PTX78>;
+  def __nvvm_read_ptx_sreg_nclusterid_z : NVPTXBuiltinSMAndPTX<"int()", SM_90, 
PTX78>;
+  def __nvvm_read_ptx_sreg_nclusterid_w : NVPTXBuiltinSMAndPTX<"int()", SM_90, 
PTX78>;
+
+  def __nvvm_read_ptx_sreg_cluster_ctaid_x : NVPTXBuiltinSMAndPTX<"int()", 
SM_90, PTX78>;
+  def __nvvm_read_ptx_sreg_cluster_ctaid_y : NVPTXBuiltinSMAndPTX<"int()", 
SM_90, PTX78>;
+  def __nvvm_read_ptx_sreg_cluster_ctaid_z : NVPTXBuiltinSMAndPTX<"int()", 
SM_90, PTX78>;
+  def __nvvm_read_ptx_sreg_cluster_ctaid_w : NVPTXBuiltinSMAndPTX<"int()", 
SM_90, PTX78>;
+
+  def __nvvm_read_ptx_sreg_cluster_nctaid_x : NVPTXBuiltinSMAndPTX<"int()", 
SM_90, PTX78>;
+  def __nvvm_read_ptx_sreg_cluster_nctaid_y : NVPTXBuiltinSMAndPTX<"int()", 
SM_90, PTX78>;
+  def __nvvm_read_ptx_sreg_cluster_nctaid_z :

[clang] [llvm] [mlir] [NVPTX] Switch front-ends and tests to ptx_kernel cc (PR #120806)

2024-12-23 Thread Durgadoss R via cfe-commits



@@ -556,19 +556,16 @@ llvm.func @kernel_func() attributes {nvvm.kernel} {
   llvm.return
 }
 
-// CHECK: !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK: {ptr @kernel_func, !"kernel", i32 1}
+// CHECK: ptx_kernel void @kernel_func

durga4github wrote:

So, downstream (non-upstream ones) users should migrate to this cc-based 
instead of kernel-metadata based, right?
I am asking since this does not seem to be an opt-in/optional change.

Alex, Could you please clarify?

https://github.com/llvm/llvm-project/pull/120806
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Switch front-ends and tests to ptx_kernel cc (PR #120806)

2024-12-23 Thread Durgadoss R via cfe-commits



@@ -556,19 +556,16 @@ llvm.func @kernel_func() attributes {nvvm.kernel} {
   llvm.return
 }
 
-// CHECK: !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK: {ptr @kernel_func, !"kernel", i32 1}
+// CHECK: ptx_kernel void @kernel_func

durga4github wrote:

ok, thanks for clarifying!

https://github.com/llvm/llvm-project/pull/120806
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants (PR #134345)

2025-04-05 Thread Durgadoss R via cfe-commits


https://github.com/durga4github edited 
https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] cuda clang: Fix argument order for __reduce_max_sync (PR #132881)

2025-03-26 Thread Durgadoss R via cfe-commits


https://github.com/durga4github approved this pull request.

The latest changes LGTM

https://github.com/llvm/llvm-project/pull/132881
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits



@@ -2034,13 +2038,15 @@ multiclass F_ATOMIC_2_AS, 
preds>;
   defm _S : F_ATOMIC_2, 
preds>;
+  defm _S_C : F_ATOMIC_2, !listconcat([hasSM<80>], preds)>;
   defm _GEN : F_ATOMIC_2, 
preds>;
 }
 
 multiclass F_ATOMIC_3_AS preds = []> {
   defvar frag_pat = (frag node:$a, node:$b, node:$c);
   defm _G : F_ATOMIC_3, preds>;
   defm _S : F_ATOMIC_3, preds>;
+  defm _S_C : F_ATOMIC_3, !listconcat([hasSM<80>], preds)>;

durga4github wrote:

Did you mean hasSM<90> here?
(May be it is right, I am just trying to get it clarified for myself)

https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits



@@ -982,8 +982,9 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
 case ADDRESS_SPACE_SHARED:
   Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
   break;
-case ADDRESS_SPACE_DSHARED:
-  Opc = TM.is64Bit() ? NVPTX::cvta_dshared_64 : NVPTX::cvta_dshared;
+case ADDRESS_SPACE_SHARED_CLUSTER:
+  Opc = TM.is64Bit() ? NVPTX::cvta_shared_cluster_64
+ : NVPTX::cvta_shared_cluster;

durga4github wrote:

okay, I believe we still support 32-bit for shared-memory pointers, even in 
sm_90+?

https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits


https://github.com/durga4github closed 
https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits


durga4github wrote:

(Sorry I clicked the wrong button `Close` instead of `Comment`)

https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits


https://github.com/durga4github reopened 
https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits



@@ -4704,6 +4754,43 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function 
*NewFn) {
 CI->eraseFromParent();
 return;
   }
+  case Intrinsic::nvvm_mapa_shared_cluster: {
+// Create a new call with the correct address space.
+NewCall =
+Builder.CreateCall(NewFn, {CI->getArgOperand(0), 
CI->getArgOperand(1)});
+Value *Res = NewCall;
+Res = Builder.CreateAddrSpaceCast(
+Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
+Res = Builder.CreateAddrSpaceCast(
+Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));

durga4github wrote:

Should this be shared_cluster?

https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits


durga4github wrote:

A general thought,
Can we include the base changes in this PR and create a separate PR for the 
intrinsics-migration+MLIR changes?

https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits



@@ -117,13 +117,15 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void 
LLVMInitializeNVPTXTarget() {
 static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
   std::string Ret = "e";
 
-  if (!is64Bit)
-Ret += "-p:32:32";
-  else if (UseShortPointers)
-Ret += "-p3:32:32-p4:32:32-p5:32:32";
-
   // Tensor Memory (addrspace:6) is always 32-bits.
-  Ret += "-p6:32:32";
+  // Distributed Shared Memory (addrspace:7) follows shared memory
+  // (addrspace:3).
+  if (!is64Bit)
+Ret += "-p:32:32-p6:32:32-p7:32:32";
+  else if (UseShortPointers) {
+Ret += "-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32";
+  } else
+Ret += "-p6:32:32";

durga4github wrote:

We need the comment from line 120 here. Please retain it as is.

https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits



@@ -43,7 +43,10 @@ enum NVVMMemorySpace {
   /// Tensor memory space identifier.
   /// Tensor memory is available only in arch-accelerated
   /// variants from sm100 onwards.
-  kTensorMemorySpace = 6
+  kTensorMemorySpace = 6,
+  /// Distributed shared memory space identifier.
+  /// Distributed shared memory is available only in sm80+.

durga4github wrote:

We have sm_86, sm_87 etc. that do not support dsmem.
So, let us say "sm 90 onwards" here.

https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#memory-hierarchy


https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space. (PR #135444)

2025-04-14 Thread Durgadoss R via cfe-commits



@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc < %s -o - -mcpu=sm_90 -march=nvptx64 -mattr=+ptx80 | FileCheck %s
+; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| 
%ptxas-verify -arch=sm_90 %}

durga4github wrote:

and we only need +ptx78 here..

https://github.com/llvm/llvm-project/pull/135444
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics for redux.sync f32 instructions (PR #126664)

2025-02-13 Thread Durgadoss R via cfe-commits


durga4github wrote:

Merging as per offline request

https://github.com/llvm/llvm-project/pull/126664
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics for redux.sync f32 instructions (PR #126664)

2025-02-13 Thread Durgadoss R via cfe-commits


https://github.com/durga4github closed 
https://github.com/llvm/llvm-project/pull/126664
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants (PR #134345)

2025-04-04 Thread Durgadoss R via cfe-commits



@@ -1548,6 +1548,45 @@ let TargetPrefix = "nvvm" in {
   Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_e5m2x2_to_f16x2_rn_relu : 
ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">,
   Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+  
+  def int_nvvm_ff_to_e2m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn">,
+  Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, 
IntrNoCallback]>;
+  def int_nvvm_ff_to_e2m3x2_rn_relu : 
ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn_relu">,
+  Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, 
IntrNoCallback]>;
+  def int_nvvm_ff_to_e3m2x2_rn : ClangBuiltin<"__nvvm_ff_to_e3m2x2_rn">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, 
IntrNoCallback]>;
+  def int_nvvm_ff_to_e3m2x2_rn_relu : 
ClangBuiltin<"__nvvm_ff_to_e3m2x2_rn_relu">,
+  Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, 
IntrNoCallback]>;
+
+  def int_nvvm_e2m3x2_to_f16x2_rn : ClangBuiltin<"__nvvm_e2m3x2_to_f16x2_rn">,
+  Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_e2m3x2_to_f16x2_rn_relu : 
ClangBuiltin<"__nvvm_e2m3x2_to_f16x2_rn_relu">,
+  Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_e3m2x2_to_f16x2_rn : ClangBuiltin<"__nvvm_e3m2x2_to_f16x2_rn">,
+  Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_e3m2x2_to_f16x2_rn_relu : 
ClangBuiltin<"__nvvm_e3m2x2_to_f16x2_rn_relu">,
+  Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;

durga4github wrote:

optional:
We may combine this chunk with the e4m3/e5m2 conversions above since they all 
take an i16 and return v2f16, to reduce the verbosity. But this can be a 
separate NFC change.

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants (PR #134345)

2025-04-04 Thread Durgadoss R via cfe-commits


durga4github wrote:

Change looks good to me overall. Let us wait for Artem's review.

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-09 Thread Durgadoss R via cfe-commits



@@ -1663,6 +1663,13 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_ # type # _to_f16x2 # suffix : CVT_I16_TO_F16X2;
 }
   }
+
+  // FP4 conversions.
+  foreach relu = ["", "_relu"] in {
+defvar suffix = !strconcat("_rn", relu);

durga4github wrote:

nit: since it is only two variables, may be using # is simpler.

https://github.com/llvm/llvm-project/pull/139244
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-09 Thread Durgadoss R via cfe-commits


https://github.com/durga4github approved this pull request.


https://github.com/llvm/llvm-project/pull/139244
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-09 Thread Durgadoss R via cfe-commits


durga4github wrote:

Changes LGTM. Let us wait for Artem's review.

https://github.com/llvm/llvm-project/pull/139244
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)

2025-05-20 Thread Durgadoss R via cfe-commits


https://github.com/durga4github approved this pull request.


https://github.com/llvm/llvm-project/pull/140615
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)

2025-05-20 Thread Durgadoss R via cfe-commits



@@ -102,39 +93,51 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), 
(ins Int32Regs:$pred),
  "}}"),
   [(set i32:$dst, (int_nvvm_barrier0_or i32:$pred))]>;
 
-def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;",
- [(int_nvvm_bar_sync imm:$i)]>;
-
 def INT_BAR_WARP_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync 
\t$i;",
  [(int_nvvm_bar_warp_sync imm:$i)]>,
 Requires<[hasPTX<60>, hasSM<30>]>;
 def INT_BAR_WARP_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync 
\t$i;",
  [(int_nvvm_bar_warp_sync i32:$i)]>,
 Requires<[hasPTX<60>, hasSM<30>]>;
 
-def INT_BARRIER_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "barrier.sync 
\t$i;",
-   [(int_nvvm_barrier_sync imm:$i)]>,
-Requires<[hasPTX<60>, hasSM<30>]>;
-def INT_BARRIER_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "barrier.sync 
\t$i;",
-   [(int_nvvm_barrier_sync i32:$i)]>,
-Requires<[hasPTX<60>, hasSM<30>]>;
+multiclass BARRIER1 
requires = []> {
+  def _i : BasicNVPTXInst<(outs), (ins i32imm:$i), asmstr,
+  [(intrinsic imm:$i)]>,
+   Requires;
 
-def INT_BARRIER_SYNC_CNT_RR : NVPTXInst<(outs), (ins Int32Regs:$id, 
Int32Regs:$cnt),
- "barrier.sync \t$id, $cnt;",
- [(int_nvvm_barrier_sync_cnt i32:$id, i32:$cnt)]>,
-Requires<[hasPTX<60>, hasSM<30>]>;
-def INT_BARRIER_SYNC_CNT_RI : NVPTXInst<(outs), (ins Int32Regs:$id, 
i32imm:$cnt),
- "barrier.sync \t$id, $cnt;",
- [(int_nvvm_barrier_sync_cnt i32:$id, imm:$cnt)]>,
-Requires<[hasPTX<60>, hasSM<30>]>;
-def INT_BARRIER_SYNC_CNT_IR : NVPTXInst<(outs), (ins i32imm:$id, 
Int32Regs:$cnt),
- "barrier.sync \t$id, $cnt;",
- [(int_nvvm_barrier_sync_cnt imm:$id, i32:$cnt)]>,
-Requires<[hasPTX<60>, hasSM<30>]>;
-def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
- "barrier.sync \t$id, $cnt;",
- [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
-Requires<[hasPTX<60>, hasSM<30>]>;
+  def _r : BasicNVPTXInst<(outs), (ins Int32Regs:$i), asmstr,
+  [(intrinsic i32:$i)]>,
+   Requires;
+}
+
+multiclass BARRIER2 
requires = []> {
+  def _rr : BasicNVPTXInst<(outs), (ins Int32Regs:$i, Int32Regs:$j), asmstr,
+  [(intrinsic i32:$i, i32:$j)]>,
+Requires;
+
+  def _ri : BasicNVPTXInst<(outs), (ins Int32Regs:$i, i32imm:$j), asmstr,
+  [(intrinsic i32:$i, imm:$j)]>,
+Requires;
+
+  def _ir : BasicNVPTXInst<(outs), (ins i32imm:$i, Int32Regs:$j), asmstr,
+  [(intrinsic imm:$i, i32:$j)]>,
+Requires;
+
+  def _ii : BasicNVPTXInst<(outs), (ins i32imm:$i, i32imm:$j), asmstr,
+  [(intrinsic imm:$i, imm:$j)]>,
+Requires;
+}
+
+// Note the "bar.sync" variants could be renamed to the equivalent 
corresponding
+// "barrier.*.aligned" variants. We use the older syntax for compatibility with
+// older versions of the PTX ISA.

durga4github wrote:

Yes, and thanks for this note!

https://github.com/llvm/llvm-project/pull/140615
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)

2025-05-20 Thread Durgadoss R via cfe-commits



@@ -462,24 +462,28 @@ def NVVM_MBarrierTestWaitSharedOp : 
NVVM_Op<"mbarrier.test.wait.shared">,
 // NVVM synchronization op definitions
 
//===--===//
 
-def NVVM_Barrier0Op : NVVM_IntrOp<"barrier0"> {
+def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
   let assemblyFormat = "attr-dict";
+  string llvmBuilder = [{
+  createIntrinsicCall(
+  builder, llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all,
+  {builder.getInt32(0)});
+  }];

durga4github wrote:

Sure, Alex. I will take care of this in a separate change.

https://github.com/llvm/llvm-project/pull/140615
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)

2025-05-20 Thread Durgadoss R via cfe-commits



@@ -240,6 +240,47 @@ def BF16RT   : RegTyInfo;
 def F16X2RT  : RegTyInfo;
 def BF16X2RT : RegTyInfo;
 
+// This class provides a basic wrapper around an NVPTXInst that abstracts the
+// specific syntax of most PTX instructions. It automatically handles the
+// construction of the asm string based on the provided dag arguments.
+// For example, the following asm-strings would be computed:
+//
+//   * BasicFlagsNVPTXInst<(outs Int32Regs:$dst),
+// (ins Int32Regs:$a, Int32Regs:$b), (ins),
+// "add.s32">;
+// ---> "add.s32 \t$dst, $a, $b;"
+//
+//   * BasicFlagsNVPTXInst<(outs Int32Regs:$d),
+// (ins Int32Regs:$a, Int32Regs:$b, Hexu32imm:$c),
+// (ins PrmtMode:$mode),
+// "prmt.b32${mode}">;
+// ---> "prmt.b32${mode} \t$dst, $a, $b, $c;"

durga4github wrote:

I think you meant `$d` here, for the output value

https://github.com/llvm/llvm-project/pull/140615
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)

2025-05-20 Thread Durgadoss R via cfe-commits



@@ -199,21 +199,58 @@ map in the following way to CUDA builtins:
 Barriers
 
 
-'``llvm.nvvm.barrier0``'
-^^^
+'``llvm.nvvm.barrier.cta.*``'
+^
 
 Syntax:
 """
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.barrier0()
+  declare void @llvm.nvvm.barrier.cta.sync(i32 %id, i32 %n)
+  declare void @llvm.nvvm.barrier.cta.sync.all(i32 %id)
+  declare void @llvm.nvvm.barrier.cta.arrive(i32 %id, i32 %n)
+
+  declare void @llvm.nvvm.barrier.cta.sync.aligned(i32 %id, i32 %n)
+  declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
+  declare void @llvm.nvvm.barrier.cta.arrive.aligned(i32 %id, i32 %n)
 
 Overview:
 "
 
-The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
-instruction, equivalent to the ``__syncthreads()`` call in CUDA.
+The '``@llvm.nvvm.barrier.cta.*``' family of intrinsics perform barrier
+synchronization and communication within a CTA. They can be used by the threads
+within the CTA for synchronization and communication.
+
+Semantics:
+""
+
+Operand %id specifies a logical barrier resource and must fall within the range
+0 through 15. When present, operand %n specifies the number of threads
+participating in the barrier. When specifying a thread count, the value must be
+a multiple of the warp size. With the '``@llvm.nvvm.barrier.cta.sync.*``'
+variants, the '``.all``' suffix indicates that all threads in the CTA should
+participate in the barrier and the %n operand is not present.
+
+All forms of the '``@llvm.nvvm.barrier.cta.*``' intrinsic cause the executing
+thread to wait for all non-exited threads from its warp and then marks the
+warp's arrival at the barrier. In addition to signaling its arrival at the 
+barrier, the '``@llvm.nvvm.barrier.cta.sync.*``' intrinsics cause the executing
+thread to wait for non-exited threads of all other warps participating in the
+barrier to arrive. On the other hand, the '``@llvm.nvvm.barrier.cta.arrive.*``'
+intrinsic does not cause the executing thread to wait for threads of other
+participating warps.
+
+When a barrier completes, the waiting threads are restarted without delay,
+and the barrier is reinitialized so that it can be immediately reused.
+
+The '``@llvm.nvvm.barrier.cta.*``' intrinsic has an optional '``.aligned``'
+modifier to indicate textual alignment of the barrier. When specified, it
+indicates that all threads in the CTA will execute the same
+'``@llvm.nvvm.barrier.cta.*``' instruction. In conditionally executed code, an
+aligned '``@llvm.nvvm.barrier.cta.*``' instruction should only be used if it is
+known that all threads in the CTA evaluate the condition identically, otherwise
+behavior is undefined.

durga4github wrote:

Shall we add a link to the PTX ISA here?

https://github.com/llvm/llvm-project/pull/140615
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)

2025-05-20 Thread Durgadoss R via cfe-commits



@@ -462,24 +462,28 @@ def NVVM_MBarrierTestWaitSharedOp : 
NVVM_Op<"mbarrier.test.wait.shared">,
 // NVVM synchronization op definitions
 
//===--===//
 
-def NVVM_Barrier0Op : NVVM_IntrOp<"barrier0"> {
+def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
   let assemblyFormat = "attr-dict";
+  string llvmBuilder = [{
+  createIntrinsicCall(
+  builder, llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all,
+  {builder.getInt32(0)});
+  }];

durga4github wrote:

+1.

https://github.com/llvm/llvm-project/pull/140615
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add pm_event intrinsics (PR #141278)

2025-05-26 Thread Durgadoss R via cfe-commits


https://github.com/durga4github edited 
https://github.com/llvm/llvm-project/pull/141278
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add pm_event intrinsics (PR #141278)

2025-05-26 Thread Durgadoss R via cfe-commits



@@ -1868,6 +1868,38 @@ If the request failed, the behavior of these intrinsics 
is undefined.
 
 For more information, refer `PTX ISA 
`__.
 
+Perf Monitor Event Intrinsics
+-
+
+'``llvm.nvvm.pm.event.[idx|mask]``' Intrinsics
+^^
+
+Syntax:
+"""
+
+.. code-block:: llvm
+
+declare void @llvm.nvvm.pm.event.idx(i32 immarg %idx_val)

durga4github wrote:

Thanks for the prompt on this, Alex!

I checked this and sass only has the `mask` variant.
The `immediate` variant is only a syntactic sugar from ptx and ptxas internally 
converts it to the `mask` variant.

So, I have removed the `idx` variant, and we only have the `mask` variant in 
the latest revision now.

https://github.com/llvm/llvm-project/pull/141278
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add pm_event intrinsics (PR #141278)

2025-05-26 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/141278

>From 0fc21a165a6f9202b441d1d8c4afa1252f9d6cc6 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Fri, 23 May 2025 20:43:18 +0530
Subject: [PATCH] [NVPTX] Add pm_event intrinsics

This patch adds pm_event.mask intrinsic and its
clang-builtin.

Signed-off-by: Durgadoss R 
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  1 +
 clang/test/CodeGen/builtins-nvptx.c|  7 +++
 llvm/docs/NVPTXUsage.rst   | 23 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  5 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   | 10 ++
 llvm/test/CodeGen/NVPTX/pm-event.ll| 15 ++
 6 files changed, 61 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/pm-event.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 2cea44e224674..3e479a3d62dd8 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -177,6 +177,7 @@ let Attributes = [NoReturn] in {
 }
 let Attributes = [NoThrow] in {
   def __nvvm_nanosleep : NVPTXBuiltinSMAndPTX<"void(unsigned int)", SM_70, 
PTX63>;
+  def __nvvm_pm_event_mask : NVPTXBuiltin<"void(unsigned short)">;
 }
 
 // Min Max
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index cef529163bb39..f994adb14e457 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -883,6 +883,13 @@ __device__ void nvvm_vote(int pred) {
   // CHECK: ret void
 }
 
+// CHECK-LABEL: nvvm_pm_event_mask
+__device__ void nvvm_pm_event_mask() {
+  // CHECK: call void @llvm.nvvm.pm.event.mask(i16 255)
+  __nvvm_pm_event_mask(255);
+  // CHECK: ret void
+}
+
 // CHECK-LABEL: nvvm_nanosleep
 __device__ void nvvm_nanosleep(int d) {
 #if __CUDA_ARCH__ >= 700
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 8bb0f2ed17c32..d51686c0b830c 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -1868,6 +1868,29 @@ If the request failed, the behavior of these intrinsics 
is undefined.
 
 For more information, refer `PTX ISA 
`__.
 
+Perf Monitor Event Intrinsics
+-
+
+'``llvm.nvvm.pm.event.mask``' Intrinsic
+^^^
+
+Syntax:
+"""
+
+.. code-block:: llvm
+
+declare void @llvm.nvvm.pm.event.mask(i16 immarg %mask_val)
+
+Overview:
+"
+
+The '``llvm.nvvm.pm.event.mask``' intrinsic triggers one or more
+performance monitor events. Each bit in the 16-bit immediate operand
+``%mask_val`` controls an event.
+
+For more information on the pmevent instructions, refer to the PTX ISA
+``_.
+
 Other Intrinsics
 
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 91e7d188c8533..8c8e778b57061 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -768,6 +768,11 @@ let TargetPrefix = "nvvm" in {
   DefaultAttrsIntrinsic<[], [llvm_i32_ty],
 [IntrConvergent, IntrNoMem, IntrHasSideEffects]>;
 
+  // Performance Monitor Events (pm events) intrinsics
+  def int_nvvm_pm_event_mask : NVVMBuiltin,
+  DefaultAttrsIntrinsic<[], [llvm_i16_ty],
+[IntrConvergent, IntrNoMem, IntrHasSideEffects,
+ ImmArg>]>;
 //
 // Min Max
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td 
b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8fb5884fa2a20..71da857841c95 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1052,6 +1052,16 @@ def INT_NVVM_NANOSLEEP_I : NVPTXInst<(outs), (ins 
i32imm:$i), "nanosleep.u32 \t$
 def INT_NVVM_NANOSLEEP_R : NVPTXInst<(outs), (ins Int32Regs:$i), 
"nanosleep.u32 \t$i;",
  [(int_nvvm_nanosleep i32:$i)]>,
 Requires<[hasPTX<63>, hasSM<70>]>;
+
+let hasSideEffects = 1 in {
+// Performance Monitor events
+def INT_PM_EVENT_MASK : BasicNVPTXInst<(outs),
+(ins i16imm:$mask),
+"pmevent.mask",
+[(int_nvvm_pm_event_mask timm:$mask)]>,
+Requires<[hasSM<20>, hasPTX<30>]>;
+} // hasSideEffects
+
 //
 // Min Max
 //
diff --git a/llvm/test/CodeGen/NVPTX/pm-event.ll 
b/llvm/test/CodeGen/NVPTX/pm-event.ll
new file mode 100644
index 0..871da6d414978
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/pm-event.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+declare void @llvm.nvvm.pm.e

[clang] [llvm] [NVPTX] Add pm_event intrinsics (PR #141278)

2025-05-26 Thread Durgadoss R via cfe-commits



@@ -7172,6 +7172,17 @@ defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", 
int_nvvm_setmaxnreg_dec_sync_align
 
 } // isConvergent
 
+let hasSideEffects = 1 in {
+// Performance Monitor events
+def INT_PM_EVENT_IDX :  NVPTXInst<(outs), (ins i32imm:$idx),

durga4github wrote:

Yes, fixed in the latest revision

https://github.com/llvm/llvm-project/pull/141278
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add pm_event intrinsics (PR #141278)

2025-05-26 Thread Durgadoss R via cfe-commits



@@ -768,6 +768,18 @@ let TargetPrefix = "nvvm" in {
   DefaultAttrsIntrinsic<[], [llvm_i32_ty],
 [IntrConvergent, IntrNoMem, IntrHasSideEffects]>;
 
+  // Performance Monitor Events (pm events) intrinsics
+  // The imm-argument to the _idx variant must be
+  // within the range [0, 16).
+  def int_nvvm_pm_event_idx : NVVMBuiltin,

durga4github wrote:

sorry for missing it,
I have added the clang-builtin and its test in the latest revision.

https://github.com/llvm/llvm-project/pull/141278
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add pm_event intrinsics (PR #141278)

2025-05-26 Thread Durgadoss R via cfe-commits


https://github.com/durga4github edited 
https://github.com/llvm/llvm-project/pull/141278
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add pm_event intrinsics (PR #141278)

2025-05-27 Thread Durgadoss R via cfe-commits



@@ -177,6 +177,7 @@ let Attributes = [NoReturn] in {
 }
 let Attributes = [NoThrow] in {
   def __nvvm_nanosleep : NVPTXBuiltinSMAndPTX<"void(unsigned int)", SM_70, 
PTX63>;
+  def __nvvm_pm_event_mask : NVPTXBuiltin<"void(unsigned short)">;

durga4github wrote:

Sure, fixed in the latest revision.

https://github.com/llvm/llvm-project/pull/141278
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Add pm_event intrinsics (PR #141278)

2025-05-27 Thread Durgadoss R via cfe-commits


https://github.com/durga4github updated 
https://github.com/llvm/llvm-project/pull/141278

>From ba3a94813adcb77262eb0085911e68e558c8 Mon Sep 17 00:00:00 2001
From: Durgadoss R 
Date: Fri, 23 May 2025 20:43:18 +0530
Subject: [PATCH] [NVPTX] Add pm_event intrinsics

This patch adds pm_event.mask intrinsic and its
clang-builtin.

Signed-off-by: Durgadoss R 
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  1 +
 clang/test/CodeGen/builtins-nvptx.c|  7 +++
 llvm/docs/NVPTXUsage.rst   | 23 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  5 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   | 10 ++
 llvm/test/CodeGen/NVPTX/pm-event.ll| 15 ++
 6 files changed, 61 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/pm-event.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 2cea44e224674..6e531eff6dd1d 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -177,6 +177,7 @@ let Attributes = [NoReturn] in {
 }
 let Attributes = [NoThrow] in {
   def __nvvm_nanosleep : NVPTXBuiltinSMAndPTX<"void(unsigned int)", SM_70, 
PTX63>;
+  def __nvvm_pm_event_mask : NVPTXBuiltin<"void(_Constant unsigned short)">;
 }
 
 // Min Max
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index cef529163bb39..f994adb14e457 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -883,6 +883,13 @@ __device__ void nvvm_vote(int pred) {
   // CHECK: ret void
 }
 
+// CHECK-LABEL: nvvm_pm_event_mask
+__device__ void nvvm_pm_event_mask() {
+  // CHECK: call void @llvm.nvvm.pm.event.mask(i16 255)
+  __nvvm_pm_event_mask(255);
+  // CHECK: ret void
+}
+
 // CHECK-LABEL: nvvm_nanosleep
 __device__ void nvvm_nanosleep(int d) {
 #if __CUDA_ARCH__ >= 700
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 8bb0f2ed17c32..d51686c0b830c 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -1868,6 +1868,29 @@ If the request failed, the behavior of these intrinsics 
is undefined.
 
 For more information, refer `PTX ISA 
`__.
 
+Perf Monitor Event Intrinsics
+-
+
+'``llvm.nvvm.pm.event.mask``' Intrinsic
+^^^
+
+Syntax:
+"""
+
+.. code-block:: llvm
+
+declare void @llvm.nvvm.pm.event.mask(i16 immarg %mask_val)
+
+Overview:
+"
+
+The '``llvm.nvvm.pm.event.mask``' intrinsic triggers one or more
+performance monitor events. Each bit in the 16-bit immediate operand
+``%mask_val`` controls an event.
+
+For more information on the pmevent instructions, refer to the PTX ISA
+``_.
+
 Other Intrinsics
 
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 91e7d188c8533..8c8e778b57061 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -768,6 +768,11 @@ let TargetPrefix = "nvvm" in {
   DefaultAttrsIntrinsic<[], [llvm_i32_ty],
 [IntrConvergent, IntrNoMem, IntrHasSideEffects]>;
 
+  // Performance Monitor Events (pm events) intrinsics
+  def int_nvvm_pm_event_mask : NVVMBuiltin,
+  DefaultAttrsIntrinsic<[], [llvm_i16_ty],
+[IntrConvergent, IntrNoMem, IntrHasSideEffects,
+ ImmArg>]>;
 //
 // Min Max
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td 
b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8fb5884fa2a20..71da857841c95 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1052,6 +1052,16 @@ def INT_NVVM_NANOSLEEP_I : NVPTXInst<(outs), (ins 
i32imm:$i), "nanosleep.u32 \t$
 def INT_NVVM_NANOSLEEP_R : NVPTXInst<(outs), (ins Int32Regs:$i), 
"nanosleep.u32 \t$i;",
  [(int_nvvm_nanosleep i32:$i)]>,
 Requires<[hasPTX<63>, hasSM<70>]>;
+
+let hasSideEffects = 1 in {
+// Performance Monitor events
+def INT_PM_EVENT_MASK : BasicNVPTXInst<(outs),
+(ins i16imm:$mask),
+"pmevent.mask",
+[(int_nvvm_pm_event_mask timm:$mask)]>,
+Requires<[hasSM<20>, hasPTX<30>]>;
+} // hasSideEffects
+
 //
 // Min Max
 //
diff --git a/llvm/test/CodeGen/NVPTX/pm-event.ll 
b/llvm/test/CodeGen/NVPTX/pm-event.ll
new file mode 100644
index 0..871da6d414978
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/pm-event.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+declare void @llvm

[clang] [llvm] [NVPTX] Add pm_event intrinsics (PR #141278)

2025-06-06 Thread Durgadoss R via cfe-commits


https://github.com/durga4github closed 
https://github.com/llvm/llvm-project/pull/141278
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

75 matches

Mail list logo