[llvm-branch-commits] mimplid->mimpid (PR #116745)

2024-11-18 Thread Pengcheng Wang via llvm-branch-commits

https://github.com/wangpc-pp created 
https://github.com/llvm/llvm-project/pull/116745

None


___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread via llvm-branch-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


```bash
git-clang-format --diff 8a5db30a3841b88ccac2c781d933eeb45560fdfa 
2dc76a68ef3d64d656b537206ad892dae1759415 --extensions cpp,h -- 
llvm/include/llvm/InitializePasses.h llvm/lib/CodeGen/RegAllocGreedy.cpp 
llvm/lib/CodeGen/RegAllocGreedy.h llvm/lib/CodeGen/SpillPlacement.cpp 
llvm/lib/Passes/PassBuilder.cpp llvm/include/llvm/CodeGen/SpillPlacement.h
```





View the diff from clang-format here.


```diff
diff --git a/llvm/include/llvm/CodeGen/SpillPlacement.h 
b/llvm/include/llvm/CodeGen/SpillPlacement.h
index c114acb1d0..90167d3362 100644
--- a/llvm/include/llvm/CodeGen/SpillPlacement.h
+++ b/llvm/include/llvm/CodeGen/SpillPlacement.h
@@ -163,7 +163,7 @@ public:
 
 private:
   // Only for use by legacy pass manager.
-  SpillPlacement() : nodes(nullptr, &arrayDeleter){};
+  SpillPlacement() : nodes(nullptr, &arrayDeleter) {};
 
   void releaseMemory() {
 nodes.reset();

```




https://github.com/llvm/llvm-project/pull/116618
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan updated 
https://github.com/llvm/llvm-project/pull/116618

>From c791eaa8768073b3ef770a59859346a859bd7a7f Mon Sep 17 00:00:00 2001
From: Akshat Oke 
Date: Mon, 18 Nov 2024 12:42:00 +0000
Subject: [PATCH 1/2] [CodeGen][NewPM] Port SpillPlacement analysis to NPM

---
 llvm/include/llvm/InitializePasses.h |  2 +-
 llvm/lib/CodeGen/RegAllocGreedy.cpp  |  6 +-
 llvm/lib/CodeGen/SpillPlacement.cpp  | 91 ++--
 llvm/lib/CodeGen/SpillPlacement.h| 52 +---
 4 files changed, 104 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/InitializePasses.h 
b/llvm/include/llvm/InitializePasses.h
index fb8356b9c98cb9..728b178e0cdad7 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -289,7 +289,7 @@ void initializeSinkingLegacyPassPass(PassRegistry &);
 void initializeSjLjEHPreparePass(PassRegistry &);
 void initializeSlotIndexesWrapperPassPass(PassRegistry &);
 void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &);
-void initializeSpillPlacementPass(PassRegistry &);
+void initializeSpillPlacementWrapperLegacyPass(PassRegistry &);
 void initializeStackColoringLegacyPass(PassRegistry &);
 void initializeStackFrameLayoutAnalysisPassPass(PassRegistry &);
 void initializeStackMapLivenessPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp 
b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 3542bfe18af46f..3fdf2d6e07a75f 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -162,7 +162,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy)
-INITIALIZE_PASS_DEPENDENCY(SpillPlacement)
+INITIALIZE_PASS_DEPENDENCY(SpillPlacementWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
 INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysis)
 INITIALIZE_PASS_DEPENDENCY(RegAllocPriorityAdvisorAnalysis)
@@ -217,7 +217,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired();
   AU.addPreserved();
   AU.addRequired();
-  AU.addRequired();
+  AU.addRequired();
   AU.addRequired();
   AU.addRequired();
   AU.addRequired();
@@ -2731,7 +2731,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   ORE = &getAnalysis().getORE();
   Loops = &getAnalysis().getLI();
   Bundles = &getAnalysis().getEdgeBundles();
-  SpillPlacer = &getAnalysis();
+  SpillPlacer = &getAnalysis().getResult();
   DebugVars = &getAnalysis();
 
   initializeCSRCost();
diff --git a/llvm/lib/CodeGen/SpillPlacement.cpp 
b/llvm/lib/CodeGen/SpillPlacement.cpp
index 318e2b19322bb4..c9baabf6161d3a 100644
--- a/llvm/lib/CodeGen/SpillPlacement.cpp
+++ b/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -44,17 +44,17 @@ using namespace llvm;
 
 #define DEBUG_TYPE "spill-code-placement"
 
-char SpillPlacement::ID = 0;
+char SpillPlacementWrapperLegacy::ID = 0;
 
-char &llvm::SpillPlacementID = SpillPlacement::ID;
+char &llvm::SpillPlacementID = SpillPlacementWrapperLegacy::ID;
 
-INITIALIZE_PASS_BEGIN(SpillPlacement, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SpillPlacementWrapperLegacy, DEBUG_TYPE,
   "Spill Code Placement Analysis", true, true)
 INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy)
-INITIALIZE_PASS_END(SpillPlacement, DEBUG_TYPE,
+INITIALIZE_PASS_END(SpillPlacementWrapperLegacy, DEBUG_TYPE,
 "Spill Code Placement Analysis", true, true)
 
-void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const {
+void SpillPlacementWrapperLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequired();
   AU.addRequiredTransitive();
@@ -189,32 +189,57 @@ struct SpillPlacement::Node {
   }
 };
 
-bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
+bool SpillPlacementWrapperLegacy::runOnMachineFunction(MachineFunction &MF) {
+  auto *Bundles = &getAnalysis().getEdgeBundles();
+  auto *MBFI = &getAnalysis().getMBFI();
+
+  Impl.reset(new SpillPlacement(Bundles, MBFI));
+  Impl->run(MF);
+  return false;
+}
+
+AnalysisKey SpillPlacementAnalysis::Key;
+
+SpillPlacement
+SpillPlacementAnalysis::run(MachineFunction &MF,
+MachineFunctionAnalysisManager &MFAM) {
+  auto *Bundles = &MFAM.getResult(MF);
+  auto *MBFI = &MFAM.getResult(MF);
+  SpillPlacement Impl(Bundles, MBFI);
+  Impl.run(MF);
+  return Impl;
+}
+
+bool SpillPlacementAnalysis::Result::invalidate(
+MachineFunction &MF, const PreservedAnalyses &PA,
+MachineFunctionAnalysisManager::Invalidator &Inv) {
+  auto PAC = PA.getChecker();
+  return !(PAC.preserved() ||
+   PAC.preservedSet>()) ||
+ Inv.invalidate(MF, PA) ||
+ Inv.invalidate(MF, PA);
+}
+
+void SpillPlacement::arrayDeleter(Node *N) {
+  if (N)
+delete[] N;
+}
+
+void SpillPlacement::run(MachineFunction &mf) {
   MF = &m

[llvm-branch-commits] [openmp] release/19.x: [OpenMP] Create versioned libgomp softlinks (#112973) (PR #115944)

2024-11-18 Thread Tobias Hieta via llvm-branch-commits

tru wrote:

I think it makes more sense to do this change in 20.x instead of 19.x.

https://github.com/llvm/llvm-project/pull/115944
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [RISCV] Support __builtin_cpu_is (PR #116231)

2024-11-18 Thread Pengcheng Wang via llvm-branch-commits

https://github.com/wangpc-pp updated 
https://github.com/llvm/llvm-project/pull/116231

>From 9686a2c5c5276289e72d9098f497a9f246a1c457 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng 
Date: Thu, 14 Nov 2024 22:06:45 +0800
Subject: [PATCH 1/4] Remove stale CHECKs

Created using spr 1.3.6-beta.1
---
 clang/test/CodeGen/builtin-cpu-is.c | 20 
 1 file changed, 20 deletions(-)

diff --git a/clang/test/CodeGen/builtin-cpu-is.c 
b/clang/test/CodeGen/builtin-cpu-is.c
index e4a2071cf46795..b8dd97eeacebcf 100644
--- a/clang/test/CodeGen/builtin-cpu-is.c
+++ b/clang/test/CodeGen/builtin-cpu-is.c
@@ -7,8 +7,6 @@
 // global, the bit grab, and the icmp correct.
 extern void a(const char *);
 
-// CHECK: @__cpu_model = external dso_local global { i32, i32, i32, [1 x i32] }
-
 // CHECK-X86-LABEL: define dso_local void @intel(
 // CHECK-X86-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK-X86-NEXT:  [[ENTRY:.*:]]
@@ -24,9 +22,6 @@ extern void a(const char *);
 void intel(void) {
   if (__builtin_cpu_is("intel"))
 a("intel");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr @__cpu_model
-  // CHECK: = icmp eq i32 [[LOAD]], 1
 }
 
 // CHECK-X86-LABEL: define dso_local void @amd(
@@ -44,9 +39,6 @@ void intel(void) {
 void amd(void) {
   if (__builtin_cpu_is("amd"))
 a("amd");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr @__cpu_model
-  // CHECK: = icmp eq i32 [[LOAD]], 2
 }
 
 // CHECK-X86-LABEL: define dso_local void @atom(
@@ -64,9 +56,6 @@ void amd(void) {
 void atom(void) {
   if (__builtin_cpu_is("atom"))
 a("atom");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 1)
-  // CHECK: = icmp eq i32 [[LOAD]], 1
 }
 
 // CHECK-X86-LABEL: define dso_local void @amdfam10h(
@@ -84,9 +73,6 @@ void atom(void) {
 void amdfam10h(void) {
   if (__builtin_cpu_is("amdfam10h"))
 a("amdfam10h");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 1)
-  // CHECK: = icmp eq i32 [[LOAD]], 4
 }
 
 // CHECK-X86-LABEL: define dso_local void @barcelona(
@@ -104,9 +90,6 @@ void amdfam10h(void) {
 void barcelona(void) {
   if (__builtin_cpu_is("barcelona"))
 a("barcelona");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 2)
-  // CHECK: = icmp eq i32 [[LOAD]], 4
 }
 
 // CHECK-X86-LABEL: define dso_local void @nehalem(
@@ -124,9 +107,6 @@ void barcelona(void) {
 void nehalem(void) {
   if (__builtin_cpu_is("nehalem"))
 a("nehalem");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 2)
-  // CHECK: = icmp eq i32 [[LOAD]], 1
 }
 #endif
 

>From 2bb2d5079b5bf98ba9f87e082ca3e67ab70068aa Mon Sep 17 00:00:00 2001
From: Wang Pengcheng 
Date: Thu, 14 Nov 2024 22:12:36 +0800
Subject: [PATCH 2/4] Simplify test

Created using spr 1.3.6-beta.1
---
 clang/test/CodeGen/builtin-cpu-is.c | 25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/clang/test/CodeGen/builtin-cpu-is.c 
b/clang/test/CodeGen/builtin-cpu-is.c
index b8dd97eeacebcf..8e78213a7cfcfb 100644
--- a/clang/test/CodeGen/builtin-cpu-is.c
+++ b/clang/test/CodeGen/builtin-cpu-is.c
@@ -111,12 +111,9 @@ void nehalem(void) {
 #endif
 
 #ifdef __riscv
-// CHECK-RV64-LABEL: define dso_local signext i32 @test_riscv(
-// CHECK-RV64-SAME: i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-LABEL: define dso_local signext i32 @test_cpu_is_veyron_v1(
+// CHECK-RV64-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK-RV64-NEXT:  [[ENTRY:.*:]]
-// CHECK-RV64-NEXT:[[RETVAL:%.*]] = alloca i32, align 4
-// CHECK-RV64-NEXT:[[A_ADDR:%.*]] = alloca i32, align 4
-// CHECK-RV64-NEXT:store i32 [[A]], ptr [[A_ADDR]], align 4
 // CHECK-RV64-NEXT:[[TMP0:%.*]] = load i32, ptr @__riscv_cpu_model, align 4
 // CHECK-RV64-NEXT:[[TMP1:%.*]] = icmp eq i32 [[TMP0]], 1567
 // CHECK-RV64-NEXT:[[TMP2:%.*]] = load i64, ptr getelementptr inbounds ({ 
i32, i64, i64 }, ptr @__riscv_cpu_model, i32 0, i32 1), align 8
@@ -125,20 +122,10 @@ void nehalem(void) {
 // CHECK-RV64-NEXT:[[TMP5:%.*]] = load i64, ptr getelementptr inbounds ({ 
i32, i64, i64 }, ptr @__riscv_cpu_model, i32 0, i32 2), align 8
 // CHECK-RV64-NEXT:[[TMP6:%.*]] = icmp eq i64 [[TMP5]], 273
 // CHECK-RV64-NEXT:[[TMP7:%.*]] = and i1 [[TMP4]], [[TMP6]]
-// CHECK-RV64-NEXT:br i1 [[TMP7]], label %[[IF_THEN:.*]], label 
%[[IF_END:.*]]
-// CHECK-RV64:   [[IF_THEN]]:
-// CHECK-RV64-NEXT:store i32 3, ptr [[RETVAL]], align 4
-// CHECK-RV64-NEXT:br label %[[RETURN:.*]]
-// CHECK-RV64:   [[IF_END]]:
-// CHECK-RV64-NEXT:store i32 0, ptr [[RETVAL]], align 4
-// CHECK-RV64-NEXT:br label %[[RETURN]]
-// CHECK-RV64:   [[RETURN]]:
-// CHECK-RV64-NEXT:[[TMP8:%.*]] = load i32, ptr [[RETVAL]], align 4
-// CHECK-RV64-NEXT:ret i32 [[TM

[llvm-branch-commits] [clang] [llvm] [RISCV] Support __builtin_cpu_is (PR #116231)

2024-11-18 Thread Pengcheng Wang via llvm-branch-commits

https://github.com/wangpc-pp updated 
https://github.com/llvm/llvm-project/pull/116231

>From 9686a2c5c5276289e72d9098f497a9f246a1c457 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng 
Date: Thu, 14 Nov 2024 22:06:45 +0800
Subject: [PATCH 1/4] Remove stale CHECKs

Created using spr 1.3.6-beta.1
---
 clang/test/CodeGen/builtin-cpu-is.c | 20 
 1 file changed, 20 deletions(-)

diff --git a/clang/test/CodeGen/builtin-cpu-is.c 
b/clang/test/CodeGen/builtin-cpu-is.c
index e4a2071cf46795..b8dd97eeacebcf 100644
--- a/clang/test/CodeGen/builtin-cpu-is.c
+++ b/clang/test/CodeGen/builtin-cpu-is.c
@@ -7,8 +7,6 @@
 // global, the bit grab, and the icmp correct.
 extern void a(const char *);
 
-// CHECK: @__cpu_model = external dso_local global { i32, i32, i32, [1 x i32] }
-
 // CHECK-X86-LABEL: define dso_local void @intel(
 // CHECK-X86-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK-X86-NEXT:  [[ENTRY:.*:]]
@@ -24,9 +22,6 @@ extern void a(const char *);
 void intel(void) {
   if (__builtin_cpu_is("intel"))
 a("intel");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr @__cpu_model
-  // CHECK: = icmp eq i32 [[LOAD]], 1
 }
 
 // CHECK-X86-LABEL: define dso_local void @amd(
@@ -44,9 +39,6 @@ void intel(void) {
 void amd(void) {
   if (__builtin_cpu_is("amd"))
 a("amd");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr @__cpu_model
-  // CHECK: = icmp eq i32 [[LOAD]], 2
 }
 
 // CHECK-X86-LABEL: define dso_local void @atom(
@@ -64,9 +56,6 @@ void amd(void) {
 void atom(void) {
   if (__builtin_cpu_is("atom"))
 a("atom");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 1)
-  // CHECK: = icmp eq i32 [[LOAD]], 1
 }
 
 // CHECK-X86-LABEL: define dso_local void @amdfam10h(
@@ -84,9 +73,6 @@ void atom(void) {
 void amdfam10h(void) {
   if (__builtin_cpu_is("amdfam10h"))
 a("amdfam10h");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 1)
-  // CHECK: = icmp eq i32 [[LOAD]], 4
 }
 
 // CHECK-X86-LABEL: define dso_local void @barcelona(
@@ -104,9 +90,6 @@ void amdfam10h(void) {
 void barcelona(void) {
   if (__builtin_cpu_is("barcelona"))
 a("barcelona");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 2)
-  // CHECK: = icmp eq i32 [[LOAD]], 4
 }
 
 // CHECK-X86-LABEL: define dso_local void @nehalem(
@@ -124,9 +107,6 @@ void barcelona(void) {
 void nehalem(void) {
   if (__builtin_cpu_is("nehalem"))
 a("nehalem");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 2)
-  // CHECK: = icmp eq i32 [[LOAD]], 1
 }
 #endif
 

>From 2bb2d5079b5bf98ba9f87e082ca3e67ab70068aa Mon Sep 17 00:00:00 2001
From: Wang Pengcheng 
Date: Thu, 14 Nov 2024 22:12:36 +0800
Subject: [PATCH 2/4] Simplify test

Created using spr 1.3.6-beta.1
---
 clang/test/CodeGen/builtin-cpu-is.c | 25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/clang/test/CodeGen/builtin-cpu-is.c 
b/clang/test/CodeGen/builtin-cpu-is.c
index b8dd97eeacebcf..8e78213a7cfcfb 100644
--- a/clang/test/CodeGen/builtin-cpu-is.c
+++ b/clang/test/CodeGen/builtin-cpu-is.c
@@ -111,12 +111,9 @@ void nehalem(void) {
 #endif
 
 #ifdef __riscv
-// CHECK-RV64-LABEL: define dso_local signext i32 @test_riscv(
-// CHECK-RV64-SAME: i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-LABEL: define dso_local signext i32 @test_cpu_is_veyron_v1(
+// CHECK-RV64-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK-RV64-NEXT:  [[ENTRY:.*:]]
-// CHECK-RV64-NEXT:[[RETVAL:%.*]] = alloca i32, align 4
-// CHECK-RV64-NEXT:[[A_ADDR:%.*]] = alloca i32, align 4
-// CHECK-RV64-NEXT:store i32 [[A]], ptr [[A_ADDR]], align 4
 // CHECK-RV64-NEXT:[[TMP0:%.*]] = load i32, ptr @__riscv_cpu_model, align 4
 // CHECK-RV64-NEXT:[[TMP1:%.*]] = icmp eq i32 [[TMP0]], 1567
 // CHECK-RV64-NEXT:[[TMP2:%.*]] = load i64, ptr getelementptr inbounds ({ 
i32, i64, i64 }, ptr @__riscv_cpu_model, i32 0, i32 1), align 8
@@ -125,20 +122,10 @@ void nehalem(void) {
 // CHECK-RV64-NEXT:[[TMP5:%.*]] = load i64, ptr getelementptr inbounds ({ 
i32, i64, i64 }, ptr @__riscv_cpu_model, i32 0, i32 2), align 8
 // CHECK-RV64-NEXT:[[TMP6:%.*]] = icmp eq i64 [[TMP5]], 273
 // CHECK-RV64-NEXT:[[TMP7:%.*]] = and i1 [[TMP4]], [[TMP6]]
-// CHECK-RV64-NEXT:br i1 [[TMP7]], label %[[IF_THEN:.*]], label 
%[[IF_END:.*]]
-// CHECK-RV64:   [[IF_THEN]]:
-// CHECK-RV64-NEXT:store i32 3, ptr [[RETVAL]], align 4
-// CHECK-RV64-NEXT:br label %[[RETURN:.*]]
-// CHECK-RV64:   [[IF_END]]:
-// CHECK-RV64-NEXT:store i32 0, ptr [[RETVAL]], align 4
-// CHECK-RV64-NEXT:br label %[[RETURN]]
-// CHECK-RV64:   [[RETURN]]:
-// CHECK-RV64-NEXT:[[TMP8:%.*]] = load i32, ptr [[RETVAL]], align 4
-// CHECK-RV64-NEXT:ret i32 [[TM

[llvm-branch-commits] mimplid->mimpid (PR #116745)

2024-11-18 Thread Pengcheng Wang via llvm-branch-commits

https://github.com/wangpc-pp closed 
https://github.com/llvm/llvm-project/pull/116745
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] mimplid->mimpid (PR #116745)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang-codegen

Author: Pengcheng Wang (wangpc-pp)


Changes



---
Full diff: https://github.com/llvm/llvm-project/pull/116745.diff


1 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+3-3) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 24f6209af7afe4..84626f023ec3c1 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -22539,10 +22539,10 @@ Value *CodeGenFunction::EmitRISCVCpuIs(StringRef 
CPUStr) {
   Result = Builder.CreateAnd(
   Result, Builder.CreateICmpEQ(ArchID, 
Builder.getInt64(CPUModel.MArchID)));
 
-  // Compare mimplid.
-  Value *ImplID = loadRISCVCPUID(2);
+  // Compare mimpid.
+  Value *ImpID = loadRISCVCPUID(2);
   Result = Builder.CreateAnd(
-  Result, Builder.CreateICmpEQ(ImplID, Builder.getInt64(CPUModel.MImpID)));
+  Result, Builder.CreateICmpEQ(ImpID, Builder.getInt64(CPUModel.MImpID)));
 
   return Result;
 }

```




https://github.com/llvm/llvm-project/pull/116745
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] mimplid->mimpid (PR #116745)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Pengcheng Wang (wangpc-pp)


Changes



---
Full diff: https://github.com/llvm/llvm-project/pull/116745.diff


1 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+3-3) 


```diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 24f6209af7afe4..84626f023ec3c1 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -22539,10 +22539,10 @@ Value *CodeGenFunction::EmitRISCVCpuIs(StringRef 
CPUStr) {
   Result = Builder.CreateAnd(
   Result, Builder.CreateICmpEQ(ArchID, 
Builder.getInt64(CPUModel.MArchID)));
 
-  // Compare mimplid.
-  Value *ImplID = loadRISCVCPUID(2);
+  // Compare mimpid.
+  Value *ImpID = loadRISCVCPUID(2);
   Result = Builder.CreateAnd(
-  Result, Builder.CreateICmpEQ(ImplID, Builder.getInt64(CPUModel.MImpID)));
+  Result, Builder.CreateICmpEQ(ImpID, Builder.getInt64(CPUModel.MImpID)));
 
   return Result;
 }

```




https://github.com/llvm/llvm-project/pull/116745
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [RISCV] Support __builtin_cpu_is (PR #116231)

2024-11-18 Thread Pengcheng Wang via llvm-branch-commits


@@ -22505,6 +22506,47 @@ Value 
*CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
   return nullptr;
 }
 
+Value *CodeGenFunction::EmitRISCVCpuIs(const CallExpr *E) {
+  const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
+  StringRef CPUStr = cast(CPUExpr)->getString();
+  return EmitRISCVCpuIs(CPUStr);
+}
+
+Value *CodeGenFunction::EmitRISCVCpuIs(StringRef CPUStr) {
+  llvm::Type *Int32Ty = Builder.getInt32Ty();
+  llvm::Type *Int64Ty = Builder.getInt64Ty();
+  llvm::StructType *StructTy = llvm::StructType::get(Int32Ty, Int64Ty, 
Int64Ty);
+  llvm::Constant *RISCVCPUModel =
+  CGM.CreateRuntimeVariable(StructTy, "__riscv_cpu_model");
+  cast(RISCVCPUModel)->setDSOLocal(true);
+
+  auto loadRISCVCPUID = [&](unsigned Index) {
+Value *Ptr = Builder.CreateStructGEP(StructTy, RISCVCPUModel, Index);
+Value *CPUID = Builder.CreateAlignedLoad(StructTy->getTypeAtIndex(Index),

wangpc-pp wrote:

Tried it, but `CreateLoad` in `CGBuilder` needs `Address` which also needs 
alignment.

https://github.com/llvm/llvm-project/pull/116231
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 global_load_lds_* instructions (PR #116680)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Nov 19, 12:46 AM EST**: A user started a stack merge that includes this 
pull request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/116680).


https://github.com/llvm/llvm-project/pull/116680
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 (PR #116679)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Nov 19, 12:46 AM EST**: A user started a stack merge that includes this 
pull request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/116679).


https://github.com/llvm/llvm-project/pull/116679
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread via llvm-branch-commits


@@ -38,13 +39,21 @@ class BitVector;
 class EdgeBundles;
 class MachineBlockFrequencyInfo;
 class MachineFunction;
+class SpillPlacementWrapperLegacy;
+class SpillPlacementAnalysis;
+
+class SpillPlacement {
+  friend class SpillPlacementWrapperLegacy;
+  friend class SpillPlacementAnalysis;
 
-class SpillPlacement : public MachineFunctionPass {
   struct Node;
+
   const MachineFunction *MF = nullptr;
   const EdgeBundles *bundles = nullptr;
   const MachineBlockFrequencyInfo *MBFI = nullptr;
-  Node *nodes = nullptr;
+
+  static void arrayDeleter(Node *N);
+  std::unique_ptr nodes;

paperchalice wrote:

An outlined default destructor would work. 🤔

https://github.com/llvm/llvm-project/pull/116618
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 global_load_lds_* instructions (PR #116680)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits


@@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS 
<0x028, 0x12>;
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;
 
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;
+

arsenm wrote:

whatever was in the merge 

https://github.com/llvm/llvm-project/pull/116680
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Nov 19, 12:46 AM EST**: A user started a stack merge that includes this 
pull request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/116681).


https://github.com/llvm/llvm-project/pull/116681
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread Akshat Oke via llvm-branch-commits


@@ -38,13 +39,21 @@ class BitVector;
 class EdgeBundles;
 class MachineBlockFrequencyInfo;
 class MachineFunction;
+class SpillPlacementWrapperLegacy;
+class SpillPlacementAnalysis;
+
+class SpillPlacement {
+  friend class SpillPlacementWrapperLegacy;
+  friend class SpillPlacementAnalysis;
 
-class SpillPlacement : public MachineFunctionPass {
   struct Node;
+
   const MachineFunction *MF = nullptr;
   const EdgeBundles *bundles = nullptr;
   const MachineBlockFrequencyInfo *MBFI = nullptr;
-  Node *nodes = nullptr;
+
+  static void arrayDeleter(Node *N);
+  std::unique_ptr nodes;

optimisan wrote:

The definition of `Node` is not available here, so the default deleter fails to 
compile sizeof(Node) for this incomplete type. To hack around it I put the 
definition of `arrayDeleter` in the implementation where struct Node is defined.

But changing to `unique_ptr` facilitates removal of 
`.get()` calls 

https://github.com/llvm/llvm-project/pull/116618
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [flang][OpenMP] Apply modifier representation to semantic checks (PR #116658)

2024-11-18 Thread Krzysztof Parzyszek via llvm-branch-commits

https://github.com/kparzysz updated 
https://github.com/llvm/llvm-project/pull/116658

>From fac6a8594643811418f37ee42fc1ac35bcc2a244 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek 
Date: Thu, 14 Nov 2024 07:29:59 -0600
Subject: [PATCH 1/2] [flang][OpenMP] Apply modifier representation to semantic
 checks

Also, define helper macros in parse-tree.h.

Apply the new modifier representation to the DEFAULTMAP and REDUCTION
clauses, with testcases utilizing the new modifier validation.

OpenMP modifier overhaul: #3/3
---
 flang/include/flang/Parser/dump-parse-tree.h  |  8 +-
 flang/include/flang/Parser/parse-tree.h   | 49 +--
 .../flang/Semantics/openmp-modifiers.h|  4 +
 flang/lib/Lower/OpenMP/Clauses.cpp| 33 
 flang/lib/Parser/openmp-parsers.cpp   | 40 +
 flang/lib/Parser/unparse.cpp  | 15 ++--
 flang/lib/Semantics/check-omp-structure.cpp   | 83 +++
 flang/lib/Semantics/check-omp-structure.h |  3 +-
 flang/lib/Semantics/openmp-modifiers.cpp  | 33 
 flang/lib/Semantics/resolve-directives.cpp| 52 +++-
 .../test/Parser/OpenMP/defaultmap-clause.f90  |  8 +-
 .../test/Parser/OpenMP/defaultmap-unparse.f90 | 16 ++--
 .../test/Parser/OpenMP/reduction-modifier.f90 |  6 +-
 .../Semantics/OpenMP/combined-constructs.f90  | 12 +--
 .../OpenMP/defaultmap-clause-v45.f90  |  2 +-
 15 files changed, 236 insertions(+), 128 deletions(-)

diff --git a/flang/include/flang/Parser/dump-parse-tree.h 
b/flang/include/flang/Parser/dump-parse-tree.h
index df5bf1d8d3200e..9c59ce520a31aa 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -509,9 +509,11 @@ class ParseTreeDumper {
   NODE(parser, OmpDeclareMapperSpecifier)
   NODE(parser, OmpDefaultClause)
   NODE_ENUM(OmpDefaultClause, Type)
+  NODE(parser, OmpVariableCategory)
+  NODE_ENUM(OmpVariableCategory, Value)
   NODE(parser, OmpDefaultmapClause)
   NODE_ENUM(OmpDefaultmapClause, ImplicitBehavior)
-  NODE_ENUM(OmpDefaultmapClause, VariableCategory)
+  NODE(OmpDefaultmapClause, Modifier)
   NODE(parser, OmpDependenceType)
   NODE_ENUM(OmpDependenceType, Value)
   NODE(parser, OmpTaskDependenceType)
@@ -567,8 +569,10 @@ class ParseTreeDumper {
   NODE_ENUM(OmpBindClause, Type)
   NODE(parser, OmpProcBindClause)
   NODE_ENUM(OmpProcBindClause, Type)
-  NODE_ENUM(OmpReductionClause, ReductionModifier)
+  NODE(parser, OmpReductionModifier)
+  NODE_ENUM(OmpReductionModifier, Value)
   NODE(parser, OmpReductionClause)
+  NODE(OmpReductionClause, Modifier)
   NODE(parser, OmpInReductionClause)
   NODE(parser, OmpReductionCombiner)
   NODE(OmpReductionCombiner, FunctionCombiner)
diff --git a/flang/include/flang/Parser/parse-tree.h 
b/flang/include/flang/Parser/parse-tree.h
index ef49a36578270e..5b28bcd4e21b80 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3440,6 +3440,16 @@ struct OmpObject {
 
 WRAPPER_CLASS(OmpObjectList, std::list);
 
+#define MODIFIER_BOILERPLATE(...) \
+  struct Modifier { \
+using Variant = std::variant<__VA_ARGS__>; \
+UNION_CLASS_BOILERPLATE(Modifier); \
+CharBlock source; \
+Variant u; \
+  }
+
+#define MODIFIERS() std::optional>
+
 inline namespace modifier {
 // For uniformity, in all keyword modifiers the name of the type defined
 // by ENUM_CLASS is "Value", e.g.
@@ -3505,12 +3515,20 @@ struct OmpLinearModifier {
 //   - |// since 4.5, until 5.2
 //   + | * | .AND. | .OR. | .EQV. | .NEQV. |// since 4.5
 //   MIN | MAX | IAND | IOR | IEOR  // since 4.5
-//
 struct OmpReductionIdentifier {
   UNION_CLASS_BOILERPLATE(OmpReductionIdentifier);
   std::variant u;
 };
 
+// Ref: [5.0:300-302], [5.1:332-334], [5.2:134-137]
+//
+// reduction-modifier ->
+//   DEFAULT | INSCAN | TASK// since 5.0
+struct OmpReductionModifier {
+  ENUM_CLASS(Value, Default, Inscan, Task);
+  WRAPPER_CLASS_BOILERPLATE(OmpReductionModifier, Value);
+};
+
 // Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321]
 //
 // task-dependence-type -> // "dependence-type" in 5.1 and before
@@ -3521,6 +3539,17 @@ struct OmpTaskDependenceType {
   ENUM_CLASS(Value, In, Out, Inout, Inoutset, Mutexinoutset, Depobj)
   WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Value);
 };
+
+// Ref: [4.5:229-230], [5.0:324-325], [5.1:357-358], [5.2:161-162]
+//
+// variable-category ->
+//   SCALAR |   // since 4.5
+//   AGGREGATE | ALLOCATABLE | POINTER |// since 5.0
+//   ALL// since 5.2
+struct OmpVariableCategory {
+  ENUM_CLASS(Value, Aggregate, All, Allocatable, Pointer, Scalar)
+  WRAPPER_CLASS_BOILERPLATE(OmpVariableCategory, Value);
+};
 } // namespace modifier
 
 // --- Clauses
@@ -3578,8 +3607,8 @@ struct OmpDefaultmapClause {
   TUPLE_CLASS_BOILERPLATE(OmpDef

[llvm-branch-commits] [clang] [llvm] AMDGPU: Add first gfx950 mfma instructions (PR #116312)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116312

>From 6c8fd97756f9b08e3562a8702b2aae186ef72075 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 21 Nov 2023 10:03:19 +0900
Subject: [PATCH] AMDGPU: Add first gfx950 mfma instructions

Scheduling info and hazards are wrong and TBD.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   6 +
 .../CodeGenOpenCL/builtins-amdgcn-mfma.cl |  25 +-
 .../builtins-amdgcn-error-gfx950-param.cl |  21 ++
 .../builtins-amdgcn-error-gfx950.cl   |  12 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   9 +
 llvm/lib/Target/AMDGPU/AMDGPU.td  |   4 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   4 +-
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   4 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   4 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  22 ++
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |  17 ++
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 274 ++
 llvm/test/MC/AMDGPU/mai-gfx950.s  | 112 +++
 .../MC/Disassembler/AMDGPU/gfx950_mai.txt |  61 
 llvm/test/tools/llvm-mca/AMDGPU/gfx950.s  |  18 ++
 16 files changed, 592 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
 create mode 100644 llvm/test/MC/AMDGPU/mai-gfx950.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
 create mode 100644 llvm/test/tools/llvm-mca/AMDGPU/gfx950.s

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 61516eb2a4a723..6917d8d1aca69d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -431,6 +431,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", 
"nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
 
+//===--===//
+// GFX950 only builtins.
+//===--===//
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", 
"nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", 
"nc", "gfx950-insts")
+
 
//===--===//
 // GFX12+ only builtins.
 
//===--===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index dcdeee6b6acc40..a644a60f9ec381 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 
-DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX908
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a 
-DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX90A
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 
-DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX940
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 
-DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX950
 
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 
@@ -222,7 +223,7 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, 
double b, double c)
 
 #endif // MFMA_GFX90A_TESTS
 
-#ifdef MFMA_GFX940_TESTS
+#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
 // CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
 // CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 
%b, <4 x i32> %c, i32 0, i32 0, i32 0)
 void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
@@ -404,4 +405,24 @@ void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, 
v2i a, v4i b, v16f c, in
 {
   *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
 }
-#endif // MFMA_GFX940_TESTS
+#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
+
+#ifdef MFMA_GFX950_TESTS
+
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x 
half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+
+v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c)
+{
+  return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3);
+}
+
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32

[llvm-branch-commits] [llvm] AMDGPU: Add V_CVT_PK_BF16_F32 for gfx950 (PR #116678)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116678

>From 1adfc6bf758377390753d35df51fb7a294202238 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap 
Date: Mon, 5 Feb 2024 04:29:01 -0500
Subject: [PATCH] AMDGPU: Add V_CVT_PK_BF16_F32 for gfx950

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   6 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  25 ++
 llvm/test/CodeGen/AMDGPU/bf16-conversions.ll  | 395 --
 llvm/test/MC/AMDGPU/gfx950_asm_vop3.s |  26 ++
 .../Disassembler/AMDGPU/gfx950_dasm_vop3.txt  |  19 +
 6 files changed, 255 insertions(+), 217 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/gfx950_asm_vop3.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1e261f4256c93b..ad89812558d25c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -889,6 +889,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::MUL, MVT::i1, Promote);
 
+  if (Subtarget->hasBF16ConversionInsts()) {
+setOperationAction(ISD::FP_ROUND, MVT::v2bf16, Legal);
+setOperationAction(ISD::FP_ROUND, MVT::bf16, Legal);
+setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
+  }
+
   setTargetDAGCombine({ISD::ADD,
ISD::UADDO_CARRY,
ISD::SUB,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 882e147dc231fa..7df9be5c6f7a0b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2787,6 +2787,7 @@ def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, 
untyped]>;
 def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], 
/*EnableClamp=*/1>;
 def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
 def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
 
 def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
 def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td 
b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 551e8b3a679202..917e1b3974b46a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -944,6 +944,30 @@ let SubtargetPredicate = isGFX11Plus in {
   defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", 
VOP3_Profile>;
 } // End SubtargetPredicate = isGFX11Plus
 
+// FIXME: GlobalISel cannot distinguish f16 and bf16 and may start using bf16 
patterns
+//instead of less complex f16. Disable GlobalISel for these for now.
+def bf16_fpround : PatFrag <(ops node:$src0),  (fpround $src0), [{ return 
true; }]> {
+  let GISelPredicateCode = [{return false;}];
+}
+
+let SubtargetPredicate = HasBF16ConversionInsts in {
+  let ReadsModeReg = 0 in {
+defm V_CVT_PK_BF16_F32: VOP3Inst<"v_cvt_pk_bf16_f32", 
VOP3_Profile>;
+  }
+  def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)),
+   (V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 
0, (EXTRACT_SUBREG VReg_64:$src, sub1))>;
+  def : GCNPat<(v2bf16 (bf16_fpround v2f64:$src)),
+   (V_CVT_PK_BF16_F32_e64 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG 
VReg_128:$src, sub0_sub1)),
+  0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG 
VReg_128:$src, sub2_sub3)))>;
+  def : GCNPat<(v2bf16 (build_vector (bf16 (bf16_fpround (f32 (VOP3Mods 
f32:$src0, i32:$src0_modifiers,
+ (bf16 (bf16_fpround (f32 (VOP3Mods 
f32:$src1, i32:$src1_modifiers)),
+   (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, 
$src1)>;
+  def : GCNPat<(bf16 (bf16_fpround (f32 (VOP3Mods f32:$src0, 
i32:$src0_modifiers,
+   (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 
(IMPLICIT_DEF)))>;
+  def : GCNPat<(bf16 (bf16_fpround (f64 (VOP3Mods f64:$src0, 
i32:$src0_modifiers,
+   (V_CVT_PK_BF16_F32_e64 0, (f32 (V_CVT_F32_F64_e64 
$src0_modifiers, $src0)), 0, (f32 (IMPLICIT_DEF)))>;
+}
+
 let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
   defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", 
VOP3_Profile>;
   defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", 
VOP3_Profile>;
@@ -1721,5 +1745,6 @@ defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
 
 defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
 defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
+defm V_CVT_PK_BF16_F32: VOP3OpSel_Real_gfx9 <0x268>;
 defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
 defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll 
b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
inde

[llvm-branch-commits] [flang] [flang][OpenMP] Apply modifier representation to semantic checks (PR #116658)

2024-11-18 Thread Krzysztof Parzyszek via llvm-branch-commits

https://github.com/kparzysz updated 
https://github.com/llvm/llvm-project/pull/116658

>From fac6a8594643811418f37ee42fc1ac35bcc2a244 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek 
Date: Thu, 14 Nov 2024 07:29:59 -0600
Subject: [PATCH 1/3] [flang][OpenMP] Apply modifier representation to semantic
 checks

Also, define helper macros in parse-tree.h.

Apply the new modifier representation to the DEFAULTMAP and REDUCTION
clauses, with testcases utilizing the new modifier validation.

OpenMP modifier overhaul: #3/3
---
 flang/include/flang/Parser/dump-parse-tree.h  |  8 +-
 flang/include/flang/Parser/parse-tree.h   | 49 +--
 .../flang/Semantics/openmp-modifiers.h|  4 +
 flang/lib/Lower/OpenMP/Clauses.cpp| 33 
 flang/lib/Parser/openmp-parsers.cpp   | 40 +
 flang/lib/Parser/unparse.cpp  | 15 ++--
 flang/lib/Semantics/check-omp-structure.cpp   | 83 +++
 flang/lib/Semantics/check-omp-structure.h |  3 +-
 flang/lib/Semantics/openmp-modifiers.cpp  | 33 
 flang/lib/Semantics/resolve-directives.cpp| 52 +++-
 .../test/Parser/OpenMP/defaultmap-clause.f90  |  8 +-
 .../test/Parser/OpenMP/defaultmap-unparse.f90 | 16 ++--
 .../test/Parser/OpenMP/reduction-modifier.f90 |  6 +-
 .../Semantics/OpenMP/combined-constructs.f90  | 12 +--
 .../OpenMP/defaultmap-clause-v45.f90  |  2 +-
 15 files changed, 236 insertions(+), 128 deletions(-)

diff --git a/flang/include/flang/Parser/dump-parse-tree.h 
b/flang/include/flang/Parser/dump-parse-tree.h
index df5bf1d8d3200e..9c59ce520a31aa 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -509,9 +509,11 @@ class ParseTreeDumper {
   NODE(parser, OmpDeclareMapperSpecifier)
   NODE(parser, OmpDefaultClause)
   NODE_ENUM(OmpDefaultClause, Type)
+  NODE(parser, OmpVariableCategory)
+  NODE_ENUM(OmpVariableCategory, Value)
   NODE(parser, OmpDefaultmapClause)
   NODE_ENUM(OmpDefaultmapClause, ImplicitBehavior)
-  NODE_ENUM(OmpDefaultmapClause, VariableCategory)
+  NODE(OmpDefaultmapClause, Modifier)
   NODE(parser, OmpDependenceType)
   NODE_ENUM(OmpDependenceType, Value)
   NODE(parser, OmpTaskDependenceType)
@@ -567,8 +569,10 @@ class ParseTreeDumper {
   NODE_ENUM(OmpBindClause, Type)
   NODE(parser, OmpProcBindClause)
   NODE_ENUM(OmpProcBindClause, Type)
-  NODE_ENUM(OmpReductionClause, ReductionModifier)
+  NODE(parser, OmpReductionModifier)
+  NODE_ENUM(OmpReductionModifier, Value)
   NODE(parser, OmpReductionClause)
+  NODE(OmpReductionClause, Modifier)
   NODE(parser, OmpInReductionClause)
   NODE(parser, OmpReductionCombiner)
   NODE(OmpReductionCombiner, FunctionCombiner)
diff --git a/flang/include/flang/Parser/parse-tree.h 
b/flang/include/flang/Parser/parse-tree.h
index ef49a36578270e..5b28bcd4e21b80 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3440,6 +3440,16 @@ struct OmpObject {
 
 WRAPPER_CLASS(OmpObjectList, std::list);
 
+#define MODIFIER_BOILERPLATE(...) \
+  struct Modifier { \
+using Variant = std::variant<__VA_ARGS__>; \
+UNION_CLASS_BOILERPLATE(Modifier); \
+CharBlock source; \
+Variant u; \
+  }
+
+#define MODIFIERS() std::optional>
+
 inline namespace modifier {
 // For uniformity, in all keyword modifiers the name of the type defined
 // by ENUM_CLASS is "Value", e.g.
@@ -3505,12 +3515,20 @@ struct OmpLinearModifier {
 //   - |// since 4.5, until 5.2
 //   + | * | .AND. | .OR. | .EQV. | .NEQV. |// since 4.5
 //   MIN | MAX | IAND | IOR | IEOR  // since 4.5
-//
 struct OmpReductionIdentifier {
   UNION_CLASS_BOILERPLATE(OmpReductionIdentifier);
   std::variant u;
 };
 
+// Ref: [5.0:300-302], [5.1:332-334], [5.2:134-137]
+//
+// reduction-modifier ->
+//   DEFAULT | INSCAN | TASK// since 5.0
+struct OmpReductionModifier {
+  ENUM_CLASS(Value, Default, Inscan, Task);
+  WRAPPER_CLASS_BOILERPLATE(OmpReductionModifier, Value);
+};
+
 // Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321]
 //
 // task-dependence-type -> // "dependence-type" in 5.1 and before
@@ -3521,6 +3539,17 @@ struct OmpTaskDependenceType {
   ENUM_CLASS(Value, In, Out, Inout, Inoutset, Mutexinoutset, Depobj)
   WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Value);
 };
+
+// Ref: [4.5:229-230], [5.0:324-325], [5.1:357-358], [5.2:161-162]
+//
+// variable-category ->
+//   SCALAR |   // since 4.5
+//   AGGREGATE | ALLOCATABLE | POINTER |// since 5.0
+//   ALL// since 5.2
+struct OmpVariableCategory {
+  ENUM_CLASS(Value, Aggregate, All, Allocatable, Pointer, Scalar)
+  WRAPPER_CLASS_BOILERPLATE(OmpVariableCategory, Value);
+};
 } // namespace modifier
 
 // --- Clauses
@@ -3578,8 +3607,8 @@ struct OmpDefaultmapClause {
   TUPLE_CLASS_BOILERPLATE(OmpDef

[llvm-branch-commits] [llvm] [NFC] Use unique_ptr in SparseSet (PR #116617)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan created 
https://github.com/llvm/llvm-project/pull/116617

This allows implementing the move constructor.

>From 8a5db30a3841b88ccac2c781d933eeb45560fdfa Mon Sep 17 00:00:00 2001
From: Akshat Oke 
Date: Mon, 18 Nov 2024 10:15:19 +
Subject: [PATCH] [NFC] Use unique_ptr in SparseSet

This allows implementing the move constructor.
---
 llvm/include/llvm/ADT/SparseSet.h | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/ADT/SparseSet.h 
b/llvm/include/llvm/ADT/SparseSet.h
index c7793117ff5408..1adae0d4595ac4 100644
--- a/llvm/include/llvm/ADT/SparseSet.h
+++ b/llvm/include/llvm/ADT/SparseSet.h
@@ -129,7 +129,12 @@ class SparseSet {
   using DenseT = SmallVector;
   using size_type = unsigned;
   DenseT Dense;
-  SparseT *Sparse = nullptr;
+
+  struct Deleter {
+void operator()(SparseT *S) { free(S); }
+  };
+  std::unique_ptr Sparse;
+
   unsigned Universe = 0;
   KeyFunctorT KeyIndexOf;
   SparseSetValFunctor ValIndexOf;
@@ -144,7 +149,7 @@ class SparseSet {
   SparseSet() = default;
   SparseSet(const SparseSet &) = delete;
   SparseSet &operator=(const SparseSet &) = delete;
-  ~SparseSet() { free(Sparse); }
+  SparseSet(SparseSet &&) = default;
 
   /// setUniverse - Set the universe size which determines the largest key the
   /// set can hold.  The universe must be sized before any elements can be
@@ -159,11 +164,10 @@ class SparseSet {
 // Hysteresis prevents needless reallocations.
 if (U >= Universe/4 && U <= Universe)
   return;
-free(Sparse);
 // The Sparse array doesn't actually need to be initialized, so malloc
 // would be enough here, but that will cause tools like valgrind to
 // complain about branching on uninitialized data.
-Sparse = static_cast(safe_calloc(U, sizeof(SparseT)));
+Sparse.reset(static_cast(safe_calloc(U, sizeof(SparseT;
 Universe = U;
   }
 
@@ -205,7 +209,7 @@ class SparseSet {
 assert(Idx < Universe && "Key out of range");
 assert(Sparse != nullptr && "Invalid sparse type");
 const unsigned Stride = std::numeric_limits::max() + 1u;
-for (unsigned i = Sparse[Idx], e = size(); i < e; i += Stride) {
+for (unsigned i = Sparse.get()[Idx], e = size(); i < e; i += Stride) {
   const unsigned FoundIdx = ValIndexOf(Dense[i]);
   assert(FoundIdx < Universe && "Invalid key in set. Did object mutate?");
   if (Idx == FoundIdx)
@@ -255,7 +259,7 @@ class SparseSet {
 iterator I = findIndex(Idx);
 if (I != end())
   return std::make_pair(I, false);
-Sparse[Idx] = size();
+Sparse.get()[Idx] = size();
 Dense.push_back(Val);
 return std::make_pair(end() - 1, true);
   }
@@ -292,7 +296,7 @@ class SparseSet {
   *I = Dense.back();
   unsigned BackIdx = ValIndexOf(Dense.back());
   assert(BackIdx < Universe && "Invalid key in set. Did object mutate?");
-  Sparse[BackIdx] = I - begin();
+  Sparse.get()[BackIdx] = I - begin();
 }
 // This depends on SmallVector::pop_back() not invalidating iterators.
 // std::vector::pop_back() doesn't give that guarantee.

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][LLVM] `LLVMTypeConverter`: Tighten materialization checks (PR #116532)

2024-11-18 Thread Markus Böck via llvm-branch-commits

https://github.com/zero9178 approved this pull request.

LGTM, thank you :))

https://github.com/llvm/llvm-project/pull/116532
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [MSVC] work-around for compile time issue 102513 (PR #111314)

2024-11-18 Thread Nikita Popov via llvm-branch-commits

nikic wrote:

> @tru should this have been merged? Do I need to do something to facilitate? 
> Sorry for not following up earlier I have been sick recently. Thanks.

Backport PRs need to be part of the release milestone, otherwise they're likely 
to get forgotten about :) I added it just now.

https://github.com/llvm/llvm-project/pull/111314
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 global_load_lds_* instructions (PR #116680)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116680

>From 6711ea8a2ae2f0e50488cab587937fa6a3e00ea7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 18 Jan 2024 14:44:03 +0700
Subject: [PATCH] AMDGPU: Handle gfx950 global_load_lds_* instructions

Define global_load_lds_dwordx3 and global_load_lds_dwordx4.
Oddly it seems dwordx2 was skipped.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   2 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp  |  10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td|   9 ++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   7 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  10 ++
 .../llvm.amdgcn.global.load.lds.gfx950.ll | 137 ++
 llvm/test/MC/AMDGPU/gfx950_asm_features.s |  37 +
 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt   |  25 
 8 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll
 create mode 100644 llvm/test/MC/AMDGPU/gfx950_asm_features.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 15f33cdbf92e6e..f43ab50d2ea441 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2452,7 +2452,7 @@ class AMDGPUGlobalLoadLDS :
 [],
 [LLVMQualPointerType<1>,// Base global pointer to load from
  LLVMQualPointerType<3>,// LDS base pointer to store to
- llvm_i32_ty,   // Data byte size: 1/2/4
+ llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for 
gfx950)
  llvm_i32_ty,   // imm offset (applied to both global 
and LDS address)
  llvm_i32_ty],  // auxiliary data (imm, cachepolicy 
(bit 0 = sc0,
 //   
bit 1 = sc1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 13de93e829fab2..a6ef0069f134bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3329,6 +3329,16 @@ bool 
AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
   case 4:
 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
+break;
   }
 
   MachineBasicBlock *MBB = MI.getParent();
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db74372e9db452..861fcf017d9e4d 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -934,6 +934,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_usho
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_sshort">;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dword">;
 
+let SubtargetPredicate = HasGFX950Insts in {
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dwordx3">;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dwordx4">;
+}
+
 let SubtargetPredicate = isGFX12Plus in {
   defm GLOBAL_ATOMIC_COND_SUB_U32: FLAT_Global_Atomic_Pseudo 
<"global_atomic_cond_sub_u32", VGPR_32, i32>;
   defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo 
<"global_atomic_ordered_add_b64", VReg_64, i64>;
@@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS 
<0x028, 0x12>;
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;
 
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;
+
+
 defm GLOBAL_ATOMIC_SWAP   : FLAT_Global_Real_Atomics_vi <0x40>;
 defm GLOBAL_ATOMIC_CMPSWAP: FLAT_Global_Real_Atomics_vi <0x41>;
 defm GLOBAL_ATOMIC_ADD: FLAT_Global_Real_Atomics_vi <0x42>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4a6efe533230b1..f3f96940c1f44b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1289,6 +1289,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // hasGFX940Insts and hasGFX90AInsts are also true.
   bool hasGFX950Insts() const { return GFX950Insts; }
 
+  /// Returns true if the target supports
+  /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
+  /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
+  bool hasLDSLoadB96_B128() const {
+return h

[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116681

>From e6110347d262f74c2f2c76dfde113723ac21115c Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 18 Jan 2024 16:18:05 +0700
Subject: [PATCH] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds

Enforcing this limit in the clang builtin will come later.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   8 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp  |  18 ++
 llvm/lib/Target/AMDGPU/BUFInstructions.td |  24 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  16 ++
 .../llvm.amdgcn.global.load.lds.gfx950.ll |   8 +
 ...m.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll | 176 
 ...mdgcn.struct.ptr.buffer.load.lds.gfx950.ll | 196 ++
 llvm/test/MC/AMDGPU/mubuf-gfx950.s|  32 +++
 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt   |  19 ++
 9 files changed, 485 insertions(+), 12 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll
 create mode 100644 llvm/test/MC/AMDGPU/mubuf-gfx950.s

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f43ab50d2ea441..360af786c5160d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1674,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1693,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1715,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
@@ -1735,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a6ef0069f134bd..3522ece24f1c45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3240,6 +3240,24 @@ bool 
AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+break;
   }
 
   Ma

[llvm-branch-commits] [clang] [libcxx] [libcxxabi] [Fuchsia][cmake] Allow using FatLTO when building runtimes (PR #112277)

2024-11-18 Thread Paul Kirth via llvm-branch-commits

https://github.com/ilovepi updated 
https://github.com/llvm/llvm-project/pull/112277

>From 1dafa521d5a1e10e3f79f63a661b2e14acff5a4a Mon Sep 17 00:00:00 2001
From: Paul Kirth 
Date: Mon, 14 Oct 2024 15:06:38 -0700
Subject: [PATCH 1/4] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
 =?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.4
---
 libcxx/CMakeLists.txt|  4 
 libcxx/src/CMakeLists.txt| 10 ++
 libcxxabi/src/CMakeLists.txt | 10 ++
 3 files changed, 24 insertions(+)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index f1942e963ccc31..5a68237f7336c5 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -102,6 +102,10 @@ option(LIBCXX_ENABLE_WIDE_CHARACTERS
support the C functionality for wide characters. When wide characters are
not supported, several parts of the library will be disabled, notably the
wide character specializations of std::basic_string." ON)
+ option(LIBCXX_ENABLE_FATLTO
+   "Whether to compile libc++ with FatLTO enabled." ON)
+ option(LIBCXX_ENABLE_LTO
+   "Whether to compile libc++ with LTO enabled." ON)
 
 # To use time zone support in libc++ the platform needs to have the IANA
 # database installed. Libc++ will fail to build if this is enabled on a
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index b187677ff2db52..670db758f53173 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -173,6 +173,16 @@ if (APPLE AND LLVM_USE_SANITIZER)
   endif()
 endif()
 
+
+if(LIBCXX_ENABLE_LTO)
+  list(APPEND LIBCXX_COMPILE_FLAGS "-flto")
+  list(APPEND LIBCXX_LINK_FLAGS "-flto")
+endif()
+if(LIBCXX_ENABLE_FATLTO)
+  list(APPEND LIBCXX_COMPILE_FLAGS "-ffat-lto-objects")
+  list(APPEND LIBCXX_LINK_FLAGS "-ffat-lto-objects")
+endif()
+
 split_list(LIBCXX_COMPILE_FLAGS)
 split_list(LIBCXX_LINK_FLAGS)
 
diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt
index 480e528b819bb9..822ede39c6a525 100644
--- a/libcxxabi/src/CMakeLists.txt
+++ b/libcxxabi/src/CMakeLists.txt
@@ -143,6 +143,15 @@ if ( APPLE )
   endif()
 endif()
 
+if(LIBCXX_ENABLE_LTO)
+  list(APPEND LIBCXXABI_COMPILE_FLAGS "-flto")
+  list(APPEND LIBCXXABI_LINK_FLAGS "-flto")
+endif()
+if(LIBCXX_ENABLE_FATLTO)
+  list(APPEND LIBCXXABI_COMPILE_FLAGS "-ffat-lto-objects")
+  list(APPEND LIBCXXABI_LINK_FLAGS "-ffat-lto-objects")
+endif()
+
 split_list(LIBCXXABI_COMPILE_FLAGS)
 split_list(LIBCXXABI_LINK_FLAGS)
 
@@ -154,6 +163,7 @@ endif()
 
 include(WarningFlags)
 
+
 # Build the shared library.
 add_library(cxxabi_shared_objects OBJECT EXCLUDE_FROM_ALL ${LIBCXXABI_SOURCES} 
${LIBCXXABI_HEADERS})
 cxx_add_warning_flags(cxxabi_shared_objects ${LIBCXXABI_ENABLE_WERROR} 
${LIBCXXABI_ENABLE_PEDANTIC})

>From 38851d29d9eaf5e3c597be3f9f57179f308ba335 Mon Sep 17 00:00:00 2001
From: Paul Kirth 
Date: Mon, 14 Oct 2024 15:27:36 -0700
Subject: [PATCH 2/4] Remove newline from diff

Created using spr 1.3.4
---
 libcxxabi/src/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt
index 1a1e57aa0077b4..783f17583c62e0 100644
--- a/libcxxabi/src/CMakeLists.txt
+++ b/libcxxabi/src/CMakeLists.txt
@@ -163,7 +163,6 @@ endif()
 
 include(WarningFlags)
 
-
 # Build the shared library.
 add_library(cxxabi_shared_objects OBJECT EXCLUDE_FROM_ALL ${LIBCXXABI_SOURCES} 
${LIBCXXABI_HEADERS})
 cxx_add_warning_flags(cxxabi_shared_objects ${LIBCXXABI_ENABLE_WERROR} 
${LIBCXXABI_ENABLE_PEDANTIC})

>From 535f2f2c17a3c80aa12c0106a468a8f2127241fc Mon Sep 17 00:00:00 2001
From: Paul Kirth 
Date: Wed, 16 Oct 2024 11:20:51 -0700
Subject: [PATCH 3/4] Avoid unnecessary changes to libc++ cmake

Created using spr 1.3.4
---
 clang/cmake/caches/Fuchsia-stage2.cmake |  8 
 libcxx/CMakeLists.txt   |  4 
 libcxx/src/CMakeLists.txt   | 10 --
 libcxxabi/src/CMakeLists.txt|  9 -
 4 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake 
b/clang/cmake/caches/Fuchsia-stage2.cmake
index 5af98c7b3b3fba..e62f29ecbe6f45 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -192,6 +192,10 @@ foreach(target 
aarch64-unknown-linux-gnu;armv7-unknown-linux-gnueabihf;i386-unkn
 set(RUNTIMES_${target}_LLVM_TOOLS_DIR "${CMAKE_BINARY_DIR}/bin" CACHE BOOL 
"")
 set(RUNTIMES_${target}_LLVM_ENABLE_RUNTIMES 
"compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "")
 
+# Enable FatLTO for Linux and baremetal runtimes
+set(RUNTIMES_${target}_LLVM_ENABLE_LTO ON CACHE BOOL "")
+set(RUNTIMES_${target}_LLVM_ENABLE_FATLTO ON CACHE BOOL "")
+
 # Use .build-id link.
 list(APPEND RUNTIME_BUILD_ID_LINK "${target}")
   endif()
@@ -274,6 +278,10 @@ if(FUCHSIA_SDK)
 set(RUNTIMES_${target}+asan+noexcept_LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE 
B

[llvm-branch-commits] [clang] [llvm] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 (PR #116679)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116679

>From c2e9801ef48929f73f6141c386b6169fa24c6c43 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 19 Dec 2023 12:46:00 +0700
Subject: [PATCH] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950

Unlike the existing gfx940 intrinsics using short/i16 in place of
bfloat, this uses the natural bfloat type.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   2 +
 .../CodeGenOpenCL/builtins-amdgcn-mfma.cl |   6 +
 .../builtins-amdgcn-error-gfx950-param.cl |   7 +
 .../builtins-amdgcn-error-gfx950.cl   |   5 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   2 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |   6 +
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |   8 +
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll| 474 ++
 llvm/test/MC/AMDGPU/mai-gfx950.s  |  56 ++-
 .../MC/Disassembler/AMDGPU/gfx950_mai.txt |  27 +
 llvm/test/tools/llvm-mca/AMDGPU/gfx950.s  |  10 +-
 12 files changed, 596 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6917d8d1aca69d..7ce8f2c1669d67 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -437,6 +437,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", 
"nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", 
"nc", "gfx950-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", 
"nc", "gfx950-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, 
"V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
+
 
//===--===//
 // GFX12+ only builtins.
 
//===--===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index a644a60f9ec381..841d8fcad0fee0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -24,6 +24,7 @@ typedef short  v8s   __attribute__((ext_vector_type(8)));
 typedef short  v16s  __attribute__((ext_vector_type(16)));
 typedef short  v32s  __attribute__((ext_vector_type(32)));
 typedef double v4d   __attribute__((ext_vector_type(4)));
+typedef __bf16 v8bf16   __attribute__((ext_vector_type(8)));
 
 
 #ifdef MFMA_GFX908_TESTS
@@ -424,5 +425,10 @@ v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
   return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 
x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
+  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
+}
 
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4c267e2cac5cad..4af67763c40dd2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -4,6 +4,7 @@
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
 
 
 void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 
c, int X) {
@@ -19,3 +20,9 @@ void test_mfma_f32_32x32x16_f16(__global float16* out, half8 
a, half8 b, float16
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a 
constant integer}}
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a 
constant integer}}
 }
+
+void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, 
float16 c, int X) {
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0);  // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X);  // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl 
b/clang/test/S

[llvm-branch-commits] [clang] [llvm] AMDGPU: Add first gfx950 mfma instructions (PR #116312)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Nov 18, 4:29 PM EST**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/116312).


https://github.com/llvm/llvm-project/pull/116312
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [libcxx] [libcxxabi] [Fuchsia][cmake] Allow using FatLTO when building runtimes (PR #112277)

2024-11-18 Thread Paul Kirth via llvm-branch-commits

https://github.com/ilovepi edited 
https://github.com/llvm/llvm-project/pull/112277
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][Transforms] Support 1:N mappings in `ConversionValueMapping` (PR #116524)

2024-11-18 Thread Matthias Springer via llvm-branch-commits

https://github.com/matthias-springer updated 
https://github.com/llvm/llvm-project/pull/116524

>From 7025a8caae81e97022155b8fac8075fc29e24650 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sun, 17 Nov 2024 09:00:45 +0100
Subject: [PATCH 1/2] [mlir][LLVM] `LLVMTypeConverter`: Tighten materialization
 checks

---
 .../Conversion/LLVMCommon/TypeConverter.cpp   | 32 
 .../MemRefToLLVM/type-conversion.mlir | 57 ++
 mlir/test/lib/Dialect/LLVM/CMakeLists.txt |  1 +
 mlir/test/lib/Dialect/LLVM/TestPatterns.cpp   | 77 +++
 mlir/tools/mlir-opt/mlir-opt.cpp  |  2 +
 5 files changed, 154 insertions(+), 15 deletions(-)
 create mode 100644 mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir
 create mode 100644 mlir/test/lib/Dialect/LLVM/TestPatterns.cpp

diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp 
b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
index ce91424e7a577e..59b0f5c9b09bcd 100644
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -153,6 +153,12 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
type.isVarArg());
   });
 
+  // Helper function that checks if the given value range is a bare pointer.
+  auto isBarePointer = [](ValueRange values) {
+return values.size() == 1 &&
+   isa(values.front().getType());
+  };
+
   // Argument materializations convert from the new block argument types
   // (multiple SSA values that make up a memref descriptor) back to the
   // original block argument type. The dialect conversion framework will then
@@ -161,11 +167,10 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
   addArgumentMaterialization([&](OpBuilder &builder,
  UnrankedMemRefType resultType,
  ValueRange inputs, Location loc) {
-if (inputs.size() == 1) {
-  // Bare pointers are not supported for unranked memrefs because a
-  // memref descriptor cannot be built just from a bare pointer.
+// Note: Bare pointers are not supported for unranked memrefs because a
+// memref descriptor cannot be built just from a bare pointer.
+if (TypeRange(inputs) != getUnrankedMemRefDescriptorFields())
   return Value();
-}
 Value desc =
 UnrankedMemRefDescriptor::pack(builder, loc, *this, resultType, 
inputs);
 // An argument materialization must return a value of type
@@ -177,20 +182,17 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
   addArgumentMaterialization([&](OpBuilder &builder, MemRefType resultType,
  ValueRange inputs, Location loc) {
 Value desc;
-if (inputs.size() == 1) {
-  // This is a bare pointer. We allow bare pointers only for function entry
-  // blocks.
-  BlockArgument barePtr = dyn_cast(inputs.front());
-  if (!barePtr)
-return Value();
-  Block *block = barePtr.getOwner();
-  if (!block->isEntryBlock() ||
-  !isa(block->getParentOp()))
-return Value();
+if (isBarePointer(inputs)) {
   desc = MemRefDescriptor::fromStaticShape(builder, loc, *this, resultType,
inputs[0]);
-} else {
+} else if (TypeRange(inputs) ==
+   getMemRefDescriptorFields(resultType,
+ /*unpackAggregates=*/true)) {
   desc = MemRefDescriptor::pack(builder, loc, *this, resultType, inputs);
+} else {
+  // The inputs are neither a bare pointer nor an unpacked memref
+  // descriptor. This materialization function cannot be used.
+  return Value();
 }
 // An argument materialization must return a value of type `resultType`,
 // so insert a cast from the memref descriptor type (!llvm.struct) to the
diff --git a/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir 
b/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir
new file mode 100644
index 00..0288aa11313c72
--- /dev/null
+++ b/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir
@@ -0,0 +1,57 @@
+// RUN: mlir-opt %s -test-llvm-legalize-patterns -split-input-file
+
+// Test the argument materializer for ranked MemRef types.
+
+//   CHECK-LABEL: func @construct_ranked_memref_descriptor(
+// CHECK:   llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x 
i64>, array<2 x i64>)>
+// CHECK-COUNT-7:   llvm.insertvalue
+// CHECK:   builtin.unrealized_conversion_cast %{{.*}} : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<5x4xf32>
+func.func @construct_ranked_memref_descriptor(%arg0: !llvm.ptr, %arg1: 
!llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64) {
+  %0 = "test.direct_replacement"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, 
%arg6) : (!llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64) -> (memref<5x4xf32>)
+  "test.legal_op"(%0) : (memref<5x

[llvm-branch-commits] [llvm] [NFC] Use unique_ptr in SparseSet (PR #116617)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan ready_for_review 
https://github.com/llvm/llvm-project/pull/116617
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

optimisan wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack 
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116618?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#116618** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116618?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116618?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#116617** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116617?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116616** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116616?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`



This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn 
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/116618
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][Transforms] Support 1:N mappings in `ConversionValueMapping` (PR #116524)

2024-11-18 Thread Matthias Springer via llvm-branch-commits

https://github.com/matthias-springer edited 
https://github.com/llvm/llvm-project/pull/116524
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC] Use unique_ptr in SparseSet (PR #116617)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

optimisan wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack 
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116617?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#116618** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116618?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116617** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116617?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116617?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#116616** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116616?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`



This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn 
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/116617
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-regalloc

Author: Akshat Oke (optimisan)


Changes

I am not sure how to test this.

---
Full diff: https://github.com/llvm/llvm-project/pull/116618.diff


4 Files Affected:

- (modified) llvm/include/llvm/InitializePasses.h (+1-1) 
- (modified) llvm/lib/CodeGen/RegAllocGreedy.cpp (+3-3) 
- (modified) llvm/lib/CodeGen/SpillPlacement.cpp (+58-33) 
- (modified) llvm/lib/CodeGen/SpillPlacement.h (+42-10) 


``diff
diff --git a/llvm/include/llvm/InitializePasses.h 
b/llvm/include/llvm/InitializePasses.h
index fb8356b9c98cb9..728b178e0cdad7 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -289,7 +289,7 @@ void initializeSinkingLegacyPassPass(PassRegistry &);
 void initializeSjLjEHPreparePass(PassRegistry &);
 void initializeSlotIndexesWrapperPassPass(PassRegistry &);
 void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &);
-void initializeSpillPlacementPass(PassRegistry &);
+void initializeSpillPlacementWrapperLegacyPass(PassRegistry &);
 void initializeStackColoringLegacyPass(PassRegistry &);
 void initializeStackFrameLayoutAnalysisPassPass(PassRegistry &);
 void initializeStackMapLivenessPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp 
b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 3542bfe18af46f..3fdf2d6e07a75f 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -162,7 +162,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy)
-INITIALIZE_PASS_DEPENDENCY(SpillPlacement)
+INITIALIZE_PASS_DEPENDENCY(SpillPlacementWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
 INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysis)
 INITIALIZE_PASS_DEPENDENCY(RegAllocPriorityAdvisorAnalysis)
@@ -217,7 +217,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired();
   AU.addPreserved();
   AU.addRequired();
-  AU.addRequired();
+  AU.addRequired();
   AU.addRequired();
   AU.addRequired();
   AU.addRequired();
@@ -2731,7 +2731,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   ORE = &getAnalysis().getORE();
   Loops = &getAnalysis().getLI();
   Bundles = &getAnalysis().getEdgeBundles();
-  SpillPlacer = &getAnalysis();
+  SpillPlacer = &getAnalysis().getResult();
   DebugVars = &getAnalysis();
 
   initializeCSRCost();
diff --git a/llvm/lib/CodeGen/SpillPlacement.cpp 
b/llvm/lib/CodeGen/SpillPlacement.cpp
index 318e2b19322bb4..c9baabf6161d3a 100644
--- a/llvm/lib/CodeGen/SpillPlacement.cpp
+++ b/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -44,17 +44,17 @@ using namespace llvm;
 
 #define DEBUG_TYPE "spill-code-placement"
 
-char SpillPlacement::ID = 0;
+char SpillPlacementWrapperLegacy::ID = 0;
 
-char &llvm::SpillPlacementID = SpillPlacement::ID;
+char &llvm::SpillPlacementID = SpillPlacementWrapperLegacy::ID;
 
-INITIALIZE_PASS_BEGIN(SpillPlacement, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SpillPlacementWrapperLegacy, DEBUG_TYPE,
   "Spill Code Placement Analysis", true, true)
 INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy)
-INITIALIZE_PASS_END(SpillPlacement, DEBUG_TYPE,
+INITIALIZE_PASS_END(SpillPlacementWrapperLegacy, DEBUG_TYPE,
 "Spill Code Placement Analysis", true, true)
 
-void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const {
+void SpillPlacementWrapperLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequired();
   AU.addRequiredTransitive();
@@ -189,32 +189,57 @@ struct SpillPlacement::Node {
   }
 };
 
-bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
+bool SpillPlacementWrapperLegacy::runOnMachineFunction(MachineFunction &MF) {
+  auto *Bundles = &getAnalysis().getEdgeBundles();
+  auto *MBFI = &getAnalysis().getMBFI();
+
+  Impl.reset(new SpillPlacement(Bundles, MBFI));
+  Impl->run(MF);
+  return false;
+}
+
+AnalysisKey SpillPlacementAnalysis::Key;
+
+SpillPlacement
+SpillPlacementAnalysis::run(MachineFunction &MF,
+MachineFunctionAnalysisManager &MFAM) {
+  auto *Bundles = &MFAM.getResult(MF);
+  auto *MBFI = &MFAM.getResult(MF);
+  SpillPlacement Impl(Bundles, MBFI);
+  Impl.run(MF);
+  return Impl;
+}
+
+bool SpillPlacementAnalysis::Result::invalidate(
+MachineFunction &MF, const PreservedAnalyses &PA,
+MachineFunctionAnalysisManager::Invalidator &Inv) {
+  auto PAC = PA.getChecker();
+  return !(PAC.preserved() ||
+   PAC.preservedSet>()) ||
+ Inv.invalidate(MF, PA) ||
+ Inv.invalidate(MF, PA);
+}
+
+void SpillPlacement::arrayDeleter(Node *N) {
+  if (N)
+delete[] N;
+}
+
+void SpillPlacement::run(MachineFunction &mf) {
   MF = &mf;
-  bundles = &getAnalysis().getEdgeBundles();
 
   assert(!nodes && "Leaking node array");
-  nodes = 

[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan edited 
https://github.com/llvm/llvm-project/pull/116618
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan ready_for_review 
https://github.com/llvm/llvm-project/pull/116618
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][Transforms] Support 1:N mappings in `ConversionValueMapping` (PR #116524)

2024-11-18 Thread Matthias Springer via llvm-branch-commits

https://github.com/matthias-springer edited 
https://github.com/llvm/llvm-project/pull/116524
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan created 
https://github.com/llvm/llvm-project/pull/116618

None

>From c791eaa8768073b3ef770a59859346a859bd7a7f Mon Sep 17 00:00:00 2001
From: Akshat Oke 
Date: Mon, 18 Nov 2024 12:42:00 +
Subject: [PATCH] [CodeGen][NewPM] Port SpillPlacement analysis to NPM

---
 llvm/include/llvm/InitializePasses.h |  2 +-
 llvm/lib/CodeGen/RegAllocGreedy.cpp  |  6 +-
 llvm/lib/CodeGen/SpillPlacement.cpp  | 91 ++--
 llvm/lib/CodeGen/SpillPlacement.h| 52 +---
 4 files changed, 104 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/InitializePasses.h 
b/llvm/include/llvm/InitializePasses.h
index fb8356b9c98cb9..728b178e0cdad7 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -289,7 +289,7 @@ void initializeSinkingLegacyPassPass(PassRegistry &);
 void initializeSjLjEHPreparePass(PassRegistry &);
 void initializeSlotIndexesWrapperPassPass(PassRegistry &);
 void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &);
-void initializeSpillPlacementPass(PassRegistry &);
+void initializeSpillPlacementWrapperLegacyPass(PassRegistry &);
 void initializeStackColoringLegacyPass(PassRegistry &);
 void initializeStackFrameLayoutAnalysisPassPass(PassRegistry &);
 void initializeStackMapLivenessPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp 
b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 3542bfe18af46f..3fdf2d6e07a75f 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -162,7 +162,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy)
-INITIALIZE_PASS_DEPENDENCY(SpillPlacement)
+INITIALIZE_PASS_DEPENDENCY(SpillPlacementWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
 INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysis)
 INITIALIZE_PASS_DEPENDENCY(RegAllocPriorityAdvisorAnalysis)
@@ -217,7 +217,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired();
   AU.addPreserved();
   AU.addRequired();
-  AU.addRequired();
+  AU.addRequired();
   AU.addRequired();
   AU.addRequired();
   AU.addRequired();
@@ -2731,7 +2731,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   ORE = &getAnalysis().getORE();
   Loops = &getAnalysis().getLI();
   Bundles = &getAnalysis().getEdgeBundles();
-  SpillPlacer = &getAnalysis();
+  SpillPlacer = &getAnalysis().getResult();
   DebugVars = &getAnalysis();
 
   initializeCSRCost();
diff --git a/llvm/lib/CodeGen/SpillPlacement.cpp 
b/llvm/lib/CodeGen/SpillPlacement.cpp
index 318e2b19322bb4..c9baabf6161d3a 100644
--- a/llvm/lib/CodeGen/SpillPlacement.cpp
+++ b/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -44,17 +44,17 @@ using namespace llvm;
 
 #define DEBUG_TYPE "spill-code-placement"
 
-char SpillPlacement::ID = 0;
+char SpillPlacementWrapperLegacy::ID = 0;
 
-char &llvm::SpillPlacementID = SpillPlacement::ID;
+char &llvm::SpillPlacementID = SpillPlacementWrapperLegacy::ID;
 
-INITIALIZE_PASS_BEGIN(SpillPlacement, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SpillPlacementWrapperLegacy, DEBUG_TYPE,
   "Spill Code Placement Analysis", true, true)
 INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy)
-INITIALIZE_PASS_END(SpillPlacement, DEBUG_TYPE,
+INITIALIZE_PASS_END(SpillPlacementWrapperLegacy, DEBUG_TYPE,
 "Spill Code Placement Analysis", true, true)
 
-void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const {
+void SpillPlacementWrapperLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequired();
   AU.addRequiredTransitive();
@@ -189,32 +189,57 @@ struct SpillPlacement::Node {
   }
 };
 
-bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
+bool SpillPlacementWrapperLegacy::runOnMachineFunction(MachineFunction &MF) {
+  auto *Bundles = &getAnalysis().getEdgeBundles();
+  auto *MBFI = &getAnalysis().getMBFI();
+
+  Impl.reset(new SpillPlacement(Bundles, MBFI));
+  Impl->run(MF);
+  return false;
+}
+
+AnalysisKey SpillPlacementAnalysis::Key;
+
+SpillPlacement
+SpillPlacementAnalysis::run(MachineFunction &MF,
+MachineFunctionAnalysisManager &MFAM) {
+  auto *Bundles = &MFAM.getResult(MF);
+  auto *MBFI = &MFAM.getResult(MF);
+  SpillPlacement Impl(Bundles, MBFI);
+  Impl.run(MF);
+  return Impl;
+}
+
+bool SpillPlacementAnalysis::Result::invalidate(
+MachineFunction &MF, const PreservedAnalyses &PA,
+MachineFunctionAnalysisManager::Invalidator &Inv) {
+  auto PAC = PA.getChecker();
+  return !(PAC.preserved() ||
+   PAC.preservedSet>()) ||
+ Inv.invalidate(MF, PA) ||
+ Inv.invalidate(MF, PA);
+}
+
+void SpillPlacement::arrayDeleter(Node *N) {
+  if (N)
+delete[] N;
+}
+
+void SpillPlacement::run(MachineFunction &mf) {
   MF = 

[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan edited 
https://github.com/llvm/llvm-project/pull/116618
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NewPM] Introduce MFAnalysisGetter for a common analysis getter (PR #116166)

2024-11-18 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan closed 
https://github.com/llvm/llvm-project/pull/116166
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [RISCV] Support __builtin_cpu_is (PR #116231)

2024-11-18 Thread Pengcheng Wang via llvm-branch-commits

https://github.com/wangpc-pp updated 
https://github.com/llvm/llvm-project/pull/116231

>From 9686a2c5c5276289e72d9098f497a9f246a1c457 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng 
Date: Thu, 14 Nov 2024 22:06:45 +0800
Subject: [PATCH 1/4] Remove stale CHECKs

Created using spr 1.3.6-beta.1
---
 clang/test/CodeGen/builtin-cpu-is.c | 20 
 1 file changed, 20 deletions(-)

diff --git a/clang/test/CodeGen/builtin-cpu-is.c 
b/clang/test/CodeGen/builtin-cpu-is.c
index e4a2071cf46795..b8dd97eeacebcf 100644
--- a/clang/test/CodeGen/builtin-cpu-is.c
+++ b/clang/test/CodeGen/builtin-cpu-is.c
@@ -7,8 +7,6 @@
 // global, the bit grab, and the icmp correct.
 extern void a(const char *);
 
-// CHECK: @__cpu_model = external dso_local global { i32, i32, i32, [1 x i32] }
-
 // CHECK-X86-LABEL: define dso_local void @intel(
 // CHECK-X86-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK-X86-NEXT:  [[ENTRY:.*:]]
@@ -24,9 +22,6 @@ extern void a(const char *);
 void intel(void) {
   if (__builtin_cpu_is("intel"))
 a("intel");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr @__cpu_model
-  // CHECK: = icmp eq i32 [[LOAD]], 1
 }
 
 // CHECK-X86-LABEL: define dso_local void @amd(
@@ -44,9 +39,6 @@ void intel(void) {
 void amd(void) {
   if (__builtin_cpu_is("amd"))
 a("amd");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr @__cpu_model
-  // CHECK: = icmp eq i32 [[LOAD]], 2
 }
 
 // CHECK-X86-LABEL: define dso_local void @atom(
@@ -64,9 +56,6 @@ void amd(void) {
 void atom(void) {
   if (__builtin_cpu_is("atom"))
 a("atom");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 1)
-  // CHECK: = icmp eq i32 [[LOAD]], 1
 }
 
 // CHECK-X86-LABEL: define dso_local void @amdfam10h(
@@ -84,9 +73,6 @@ void atom(void) {
 void amdfam10h(void) {
   if (__builtin_cpu_is("amdfam10h"))
 a("amdfam10h");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 1)
-  // CHECK: = icmp eq i32 [[LOAD]], 4
 }
 
 // CHECK-X86-LABEL: define dso_local void @barcelona(
@@ -104,9 +90,6 @@ void amdfam10h(void) {
 void barcelona(void) {
   if (__builtin_cpu_is("barcelona"))
 a("barcelona");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 2)
-  // CHECK: = icmp eq i32 [[LOAD]], 4
 }
 
 // CHECK-X86-LABEL: define dso_local void @nehalem(
@@ -124,9 +107,6 @@ void barcelona(void) {
 void nehalem(void) {
   if (__builtin_cpu_is("nehalem"))
 a("nehalem");
-
-  // CHECK: [[LOAD:%[^ ]+]] = load i32, ptr getelementptr inbounds ({ i32, 
i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 2)
-  // CHECK: = icmp eq i32 [[LOAD]], 1
 }
 #endif
 

>From 2bb2d5079b5bf98ba9f87e082ca3e67ab70068aa Mon Sep 17 00:00:00 2001
From: Wang Pengcheng 
Date: Thu, 14 Nov 2024 22:12:36 +0800
Subject: [PATCH 2/4] Simplify test

Created using spr 1.3.6-beta.1
---
 clang/test/CodeGen/builtin-cpu-is.c | 25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/clang/test/CodeGen/builtin-cpu-is.c 
b/clang/test/CodeGen/builtin-cpu-is.c
index b8dd97eeacebcf..8e78213a7cfcfb 100644
--- a/clang/test/CodeGen/builtin-cpu-is.c
+++ b/clang/test/CodeGen/builtin-cpu-is.c
@@ -111,12 +111,9 @@ void nehalem(void) {
 #endif
 
 #ifdef __riscv
-// CHECK-RV64-LABEL: define dso_local signext i32 @test_riscv(
-// CHECK-RV64-SAME: i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-LABEL: define dso_local signext i32 @test_cpu_is_veyron_v1(
+// CHECK-RV64-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK-RV64-NEXT:  [[ENTRY:.*:]]
-// CHECK-RV64-NEXT:[[RETVAL:%.*]] = alloca i32, align 4
-// CHECK-RV64-NEXT:[[A_ADDR:%.*]] = alloca i32, align 4
-// CHECK-RV64-NEXT:store i32 [[A]], ptr [[A_ADDR]], align 4
 // CHECK-RV64-NEXT:[[TMP0:%.*]] = load i32, ptr @__riscv_cpu_model, align 4
 // CHECK-RV64-NEXT:[[TMP1:%.*]] = icmp eq i32 [[TMP0]], 1567
 // CHECK-RV64-NEXT:[[TMP2:%.*]] = load i64, ptr getelementptr inbounds ({ 
i32, i64, i64 }, ptr @__riscv_cpu_model, i32 0, i32 1), align 8
@@ -125,20 +122,10 @@ void nehalem(void) {
 // CHECK-RV64-NEXT:[[TMP5:%.*]] = load i64, ptr getelementptr inbounds ({ 
i32, i64, i64 }, ptr @__riscv_cpu_model, i32 0, i32 2), align 8
 // CHECK-RV64-NEXT:[[TMP6:%.*]] = icmp eq i64 [[TMP5]], 273
 // CHECK-RV64-NEXT:[[TMP7:%.*]] = and i1 [[TMP4]], [[TMP6]]
-// CHECK-RV64-NEXT:br i1 [[TMP7]], label %[[IF_THEN:.*]], label 
%[[IF_END:.*]]
-// CHECK-RV64:   [[IF_THEN]]:
-// CHECK-RV64-NEXT:store i32 3, ptr [[RETVAL]], align 4
-// CHECK-RV64-NEXT:br label %[[RETURN:.*]]
-// CHECK-RV64:   [[IF_END]]:
-// CHECK-RV64-NEXT:store i32 0, ptr [[RETVAL]], align 4
-// CHECK-RV64-NEXT:br label %[[RETURN]]
-// CHECK-RV64:   [[RETURN]]:
-// CHECK-RV64-NEXT:[[TMP8:%.*]] = load i32, ptr [[RETVAL]], align 4
-// CHECK-RV64-NEXT:ret i32 [[TM

[llvm-branch-commits] [llvm] [NFC] Use unique_ptr in SparseSet (PR #116617)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-adt

Author: Akshat Oke (optimisan)


Changes

This allows implementing the move constructor.

---
Full diff: https://github.com/llvm/llvm-project/pull/116617.diff


1 Files Affected:

- (modified) llvm/include/llvm/ADT/SparseSet.h (+11-7) 


``diff
diff --git a/llvm/include/llvm/ADT/SparseSet.h 
b/llvm/include/llvm/ADT/SparseSet.h
index c7793117ff5408..1adae0d4595ac4 100644
--- a/llvm/include/llvm/ADT/SparseSet.h
+++ b/llvm/include/llvm/ADT/SparseSet.h
@@ -129,7 +129,12 @@ class SparseSet {
   using DenseT = SmallVector;
   using size_type = unsigned;
   DenseT Dense;
-  SparseT *Sparse = nullptr;
+
+  struct Deleter {
+void operator()(SparseT *S) { free(S); }
+  };
+  std::unique_ptr Sparse;
+
   unsigned Universe = 0;
   KeyFunctorT KeyIndexOf;
   SparseSetValFunctor ValIndexOf;
@@ -144,7 +149,7 @@ class SparseSet {
   SparseSet() = default;
   SparseSet(const SparseSet &) = delete;
   SparseSet &operator=(const SparseSet &) = delete;
-  ~SparseSet() { free(Sparse); }
+  SparseSet(SparseSet &&) = default;
 
   /// setUniverse - Set the universe size which determines the largest key the
   /// set can hold.  The universe must be sized before any elements can be
@@ -159,11 +164,10 @@ class SparseSet {
 // Hysteresis prevents needless reallocations.
 if (U >= Universe/4 && U <= Universe)
   return;
-free(Sparse);
 // The Sparse array doesn't actually need to be initialized, so malloc
 // would be enough here, but that will cause tools like valgrind to
 // complain about branching on uninitialized data.
-Sparse = static_cast<SparseT *>(safe_calloc(U, sizeof(SparseT)));
+Sparse.reset(static_cast<SparseT *>(safe_calloc(U, sizeof(SparseT))));
 Universe = U;
   }
 
@@ -205,7 +209,7 @@ class SparseSet {
 assert(Idx < Universe && "Key out of range");
 assert(Sparse != nullptr && "Invalid sparse type");
 const unsigned Stride = std::numeric_limits::max() + 1u;
-for (unsigned i = Sparse[Idx], e = size(); i < e; i += Stride) {
+for (unsigned i = Sparse.get()[Idx], e = size(); i < e; i += Stride) {
   const unsigned FoundIdx = ValIndexOf(Dense[i]);
   assert(FoundIdx < Universe && "Invalid key in set. Did object mutate?");
   if (Idx == FoundIdx)
@@ -255,7 +259,7 @@ class SparseSet {
 iterator I = findIndex(Idx);
 if (I != end())
   return std::make_pair(I, false);
-Sparse[Idx] = size();
+Sparse.get()[Idx] = size();
 Dense.push_back(Val);
 return std::make_pair(end() - 1, true);
   }
@@ -292,7 +296,7 @@ class SparseSet {
   *I = Dense.back();
   unsigned BackIdx = ValIndexOf(Dense.back());
   assert(BackIdx < Universe && "Invalid key in set. Did object mutate?");
-  Sparse[BackIdx] = I - begin();
+  Sparse.get()[BackIdx] = I - begin();
 }
 // This depends on SmallVector::pop_back() not invalidating iterators.
 // std::vector::pop_back() doesn't give that guarantee.

``




https://github.com/llvm/llvm-project/pull/116617
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][Transforms] Support 1:N mappings in `ConversionValueMapping` (PR #116524)

2024-11-18 Thread Matthias Springer via llvm-branch-commits

https://github.com/matthias-springer updated 
https://github.com/llvm/llvm-project/pull/116524

>From e3946a5496cdf64ff6a8a5c7e1b117f4904ac9e5 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sun, 17 Nov 2024 04:38:09 +0100
Subject: [PATCH] [mlir][Transforms] Support 1:N mappings in
 `ConversionValueMapping`

---
 .../Conversion/LLVMCommon/TypeConverter.cpp   |  68 ++-
 .../Bufferization/Transforms/Bufferize.cpp|   1 -
 .../EmitC/Transforms/TypeConversions.cpp  |   1 -
 .../Dialect/Linalg/Transforms/Detensorize.cpp |   1 -
 .../Quant/Transforms/StripFuncQuantTypes.cpp  |   1 -
 .../Utils/SparseTensorDescriptor.cpp  |   3 -
 .../Vector/Transforms/VectorLinearize.cpp |   1 -
 .../Transforms/Utils/DialectConversion.cpp| 527 ++
 mlir/test/Transforms/test-legalizer.mlir  |   3 -
 .../Func/TestDecomposeCallGraphTypes.cpp  |   2 +-
 mlir/test/lib/Dialect/Test/TestPatterns.cpp   |  11 +-
 .../lib/Transforms/TestDialectConversion.cpp  |   1 -
 12 files changed, 335 insertions(+), 285 deletions(-)

diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp 
b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
index 59b0f5c9b09bcd..fbf1c20d0baa32 100644
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -153,20 +153,31 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
type.isVarArg());
   });
 
+  // Add generic source and target materializations to handle cases where
+  // non-LLVM types persist after an LLVM conversion.
+  addSourceMaterialization([&](OpBuilder &builder, Type resultType,
+   ValueRange inputs, Location loc) {
+return builder.create(loc, resultType, inputs)
+.getResult(0);
+  });
+  addTargetMaterialization([&](OpBuilder &builder, Type resultType,
+   ValueRange inputs, Location loc) {
+return builder.create(loc, resultType, inputs)
+.getResult(0);
+  });
+
   // Helper function that checks if the given value range is a bare pointer.
   auto isBarePointer = [](ValueRange values) {
 return values.size() == 1 &&
isa(values.front().getType());
   };
 
-  // Argument materializations convert from the new block argument types
-  // (multiple SSA values that make up a memref descriptor) back to the
-  // original block argument type. The dialect conversion framework will then
-  // insert a target materialization from the original block argument type to
-  // a legal type.
-  addArgumentMaterialization([&](OpBuilder &builder,
- UnrankedMemRefType resultType,
- ValueRange inputs, Location loc) {
+  // Source materializations convert the MemrRef descriptor elements
+  // (multiple SSA values that make up a MemrRef descriptor) back to the
+  // original MemRef type.
+  addSourceMaterialization([&](OpBuilder &builder,
+   UnrankedMemRefType resultType, ValueRange 
inputs,
+   Location loc) {
 // Note: Bare pointers are not supported for unranked memrefs because a
 // memref descriptor cannot be built just from a bare pointer.
 if (TypeRange(inputs) != getUnrankedMemRefDescriptorFields())
@@ -179,8 +190,8 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
 return builder.create(loc, resultType, desc)
 .getResult(0);
   });
-  addArgumentMaterialization([&](OpBuilder &builder, MemRefType resultType,
- ValueRange inputs, Location loc) {
+  addSourceMaterialization([&](OpBuilder &builder, MemRefType resultType,
+   ValueRange inputs, Location loc) {
 Value desc;
 if (isBarePointer(inputs)) {
   desc = MemRefDescriptor::fromStaticShape(builder, loc, *this, resultType,
@@ -200,23 +211,30 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
 return builder.create(loc, resultType, desc)
 .getResult(0);
   });
-  // Add generic source and target materializations to handle cases where
-  // non-LLVM types persist after an LLVM conversion.
-  addSourceMaterialization([&](OpBuilder &builder, Type resultType,
-   ValueRange inputs, Location loc) {
-if (inputs.size() != 1)
-  return Value();
+  addTargetMaterialization([&](OpBuilder &builder,
+   LLVM::LLVMStructType resultType,
+   ValueRange inputs, Location loc,
+   Type originalType) -> Value {
+if (auto memrefType = dyn_cast_or_null(originalType)) {
+  if (isBarePointer(inputs)) {
+return MemRefDescriptor::fromStaticShape(builder, loc, *this,
+ memrefType, inputs[0]);
+  } else if (TypeRange(inputs) ==
+ getMemRefDescriptorFields(memrefType,
+

[llvm-branch-commits] [mlir] [mlir][LLVM] `LLVMTypeConverter`: Tighten materialization checks (PR #116532)

2024-11-18 Thread Matthias Springer via llvm-branch-commits

https://github.com/matthias-springer updated 
https://github.com/llvm/llvm-project/pull/116532

>From 7025a8caae81e97022155b8fac8075fc29e24650 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sun, 17 Nov 2024 09:00:45 +0100
Subject: [PATCH] [mlir][LLVM] `LLVMTypeConverter`: Tighten materialization
 checks

---
 .../Conversion/LLVMCommon/TypeConverter.cpp   | 32 
 .../MemRefToLLVM/type-conversion.mlir | 57 ++
 mlir/test/lib/Dialect/LLVM/CMakeLists.txt |  1 +
 mlir/test/lib/Dialect/LLVM/TestPatterns.cpp   | 77 +++
 mlir/tools/mlir-opt/mlir-opt.cpp  |  2 +
 5 files changed, 154 insertions(+), 15 deletions(-)
 create mode 100644 mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir
 create mode 100644 mlir/test/lib/Dialect/LLVM/TestPatterns.cpp

diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp 
b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
index ce91424e7a577e..59b0f5c9b09bcd 100644
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -153,6 +153,12 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
type.isVarArg());
   });
 
+  // Helper function that checks if the given value range is a bare pointer.
+  auto isBarePointer = [](ValueRange values) {
+return values.size() == 1 &&
+   isa(values.front().getType());
+  };
+
   // Argument materializations convert from the new block argument types
   // (multiple SSA values that make up a memref descriptor) back to the
   // original block argument type. The dialect conversion framework will then
@@ -161,11 +167,10 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
   addArgumentMaterialization([&](OpBuilder &builder,
  UnrankedMemRefType resultType,
  ValueRange inputs, Location loc) {
-if (inputs.size() == 1) {
-  // Bare pointers are not supported for unranked memrefs because a
-  // memref descriptor cannot be built just from a bare pointer.
+// Note: Bare pointers are not supported for unranked memrefs because a
+// memref descriptor cannot be built just from a bare pointer.
+if (TypeRange(inputs) != getUnrankedMemRefDescriptorFields())
   return Value();
-}
 Value desc =
 UnrankedMemRefDescriptor::pack(builder, loc, *this, resultType, 
inputs);
 // An argument materialization must return a value of type
@@ -177,20 +182,17 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
   addArgumentMaterialization([&](OpBuilder &builder, MemRefType resultType,
  ValueRange inputs, Location loc) {
 Value desc;
-if (inputs.size() == 1) {
-  // This is a bare pointer. We allow bare pointers only for function entry
-  // blocks.
-  BlockArgument barePtr = dyn_cast(inputs.front());
-  if (!barePtr)
-return Value();
-  Block *block = barePtr.getOwner();
-  if (!block->isEntryBlock() ||
-  !isa(block->getParentOp()))
-return Value();
+if (isBarePointer(inputs)) {
   desc = MemRefDescriptor::fromStaticShape(builder, loc, *this, resultType,
inputs[0]);
-} else {
+} else if (TypeRange(inputs) ==
+   getMemRefDescriptorFields(resultType,
+ /*unpackAggregates=*/true)) {
   desc = MemRefDescriptor::pack(builder, loc, *this, resultType, inputs);
+} else {
+  // The inputs are neither a bare pointer nor an unpacked memref
+  // descriptor. This materialization function cannot be used.
+  return Value();
 }
 // An argument materialization must return a value of type `resultType`,
 // so insert a cast from the memref descriptor type (!llvm.struct) to the
diff --git a/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir 
b/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir
new file mode 100644
index 00..0288aa11313c72
--- /dev/null
+++ b/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir
@@ -0,0 +1,57 @@
+// RUN: mlir-opt %s -test-llvm-legalize-patterns -split-input-file
+
+// Test the argument materializer for ranked MemRef types.
+
+//   CHECK-LABEL: func @construct_ranked_memref_descriptor(
+// CHECK:   llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x 
i64>, array<2 x i64>)>
+// CHECK-COUNT-7:   llvm.insertvalue
+// CHECK:   builtin.unrealized_conversion_cast %{{.*}} : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<5x4xf32>
+func.func @construct_ranked_memref_descriptor(%arg0: !llvm.ptr, %arg1: 
!llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64) {
+  %0 = "test.direct_replacement"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, 
%arg6) : (!llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64) -> (memref<5x4xf32>)
+  "test.legal_op"(%0) : (memref<5x4xf3

[llvm-branch-commits] [lld] [PAC][lld][AArch64][ELF] Support signed GOT with tiny code model (PR #113816)

2024-11-18 Thread Daniil Kovalev via llvm-branch-commits

https://github.com/kovdan01 updated 
https://github.com/llvm/llvm-project/pull/113816

>From ff01757ad3d20b9538a23b12e7c3e2cd7f6dc20d Mon Sep 17 00:00:00 2001
From: Daniil Kovalev 
Date: Fri, 25 Oct 2024 21:28:18 +0300
Subject: [PATCH 1/3] [PAC][lld][AArch64][ELF] Support signed GOT with tiny
 code model

Support `R_AARCH64_AUTH_GOT_ADR_PREL_LO21` and `R_AARCH64_AUTH_GOT_LD_PREL19`
GOT-generating relocations.
---
 lld/ELF/Arch/AArch64.cpp |  5 ++
 lld/ELF/InputSection.cpp |  1 +
 lld/ELF/Relocations.cpp  | 17 ++---
 lld/ELF/Relocations.h|  1 +
 lld/test/ELF/aarch64-got-relocations-pauth.s | 73 
 5 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 076351dd00d3b3..94e79fdf1025ce 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -205,6 +205,9 @@ RelExpr AArch64::getRelExpr(RelType type, const Symbol &s,
   case R_AARCH64_AUTH_LD64_GOT_LO12_NC:
   case R_AARCH64_AUTH_GOT_ADD_LO12_NC:
 return R_AARCH64_AUTH_GOT;
+  case R_AARCH64_AUTH_GOT_LD_PREL19:
+  case R_AARCH64_AUTH_GOT_ADR_PREL_LO21:
+return R_AARCH64_AUTH_GOT_PC;
   case R_AARCH64_LD64_GOTPAGE_LO15:
 return R_AARCH64_GOT_PAGE;
   case R_AARCH64_ADR_GOT_PAGE:
@@ -549,6 +552,7 @@ void AArch64::relocate(uint8_t *loc, const Relocation &rel,
 write32AArch64Addr(loc, val >> 12);
 break;
   case R_AARCH64_ADR_PREL_LO21:
+  case R_AARCH64_AUTH_GOT_ADR_PREL_LO21:
 checkInt(ctx, loc, val, 21, rel);
 write32AArch64Addr(loc, val);
 break;
@@ -569,6 +573,7 @@ void AArch64::relocate(uint8_t *loc, const Relocation &rel,
   case R_AARCH64_CONDBR19:
   case R_AARCH64_LD_PREL_LO19:
   case R_AARCH64_GOT_LD_PREL19:
+  case R_AARCH64_AUTH_GOT_LD_PREL19:
 checkAlignment(ctx, loc, val, 4, rel);
 checkInt(ctx, loc, val, 21, rel);
 writeMaskedBits32le(loc, (val & 0x1C) << 3, 0x1C << 3);
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 84f23bf78a4e9b..d49a654c6a29b7 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -796,6 +796,7 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const 
Relocation &r,
   case R_AARCH64_GOT_PAGE:
 return r.sym->getGotVA(ctx) + a - getAArch64Page(ctx.in.got->getVA());
   case R_GOT_PC:
+  case R_AARCH64_AUTH_GOT_PC:
   case R_RELAX_TLS_GD_TO_IE:
 return r.sym->getGotVA(ctx) + a - p;
   case R_GOTPLT_GOTREL:
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index bbb3d3210e0253..d783b5f0a674cf 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -210,11 +210,11 @@ static bool needsPlt(RelExpr expr) {
 }
 
 bool lld::elf::needsGot(RelExpr expr) {
-  return oneof(
-  expr);
+  return oneof(expr);
 }
 
 // True if this expression is of the form Sym - X, where X is a position in the
@@ -1010,8 +1010,8 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr 
e, RelType type,
 R_GOTONLY_PC, R_GOTPLTONLY_PC, R_PLT_PC, R_PLT_GOTREL, 
R_PLT_GOTPLT,
 R_GOTPLT_GOTREL, R_GOTPLT_PC, R_PPC32_PLTREL, R_PPC64_CALL_PLT,
 R_PPC64_RELAX_TOC, R_RISCV_ADD, R_AARCH64_GOT_PAGE,
-R_AARCH64_AUTH_GOT, R_LOONGARCH_PLT_PAGE_PC, R_LOONGARCH_GOT,
-R_LOONGARCH_GOT_PAGE_PC>(e))
+R_AARCH64_AUTH_GOT, R_AARCH64_AUTH_GOT_PC, R_LOONGARCH_PLT_PAGE_PC,
+R_LOONGARCH_GOT, R_LOONGARCH_GOT_PAGE_PC>(e))
 return true;
 
   // These never do, except if the entire file is position dependent or if
@@ -1126,7 +1126,8 @@ void RelocationScanner::processAux(RelExpr expr, RelType 
type, uint64_t offset,
   // Many LoongArch TLS relocs reuse the R_LOONGARCH_GOT type, in which
   // case the NEEDS_GOT flag shouldn't get set.
   bool needsGotAuth =
-  (expr == R_AARCH64_AUTH_GOT || expr == R_AARCH64_AUTH_GOT_PAGE_PC);
+  (expr == R_AARCH64_AUTH_GOT || expr == R_AARCH64_AUTH_GOT_PC ||
+   expr == R_AARCH64_AUTH_GOT_PAGE_PC);
   uint16_t flags = sym.flags.load(std::memory_order_relaxed);
   if (!(flags & NEEDS_GOT)) {
 sym.setFlags(needsGotAuth ? (NEEDS_GOT | NEEDS_GOT_AUTH) : NEEDS_GOT);
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
index 20d88de402ac18..38d55d46116569 100644
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -89,6 +89,7 @@ enum RelExpr {
   R_AARCH64_AUTH_GOT_PAGE_PC,
   R_AARCH64_GOT_PAGE,
   R_AARCH64_AUTH_GOT,
+  R_AARCH64_AUTH_GOT_PC,
   R_AARCH64_PAGE_PC,
   R_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC,
   R_AARCH64_TLSDESC_PAGE,
diff --git a/lld/test/ELF/aarch64-got-relocations-pauth.s 
b/lld/test/ELF/aarch64-got-relocations-pauth.s
index 3fe73a086c729b..14f03958482dff 100644
--- a/lld/test/ELF/aarch64-got-relocations-pauth.s
+++ b/lld/test/ELF/aarch64-got-relocations-pauth.s
@@ -78,6 +78,79 @@ _start:
   adrp x1, :got_auth:zed
   add  x1, x1, :got_auth_lo12:zed
 
+#--- ok-tiny.s
+
+# RUN: ll

[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 global_load_lds_* instructions (PR #116680)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/116680

Define global_load_lds_dwordx3 and global_load_lds_dwordx4.
Oddly it seems dwordx2 was skipped.

>From 42f311ceb555ea2b3f171ad2ef8254e971e0be12 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 18 Jan 2024 14:44:03 +0700
Subject: [PATCH] AMDGPU: Handle gfx950 global_load_lds_* instructions

Define global_load_lds_dwordx3 and global_load_lds_dwordx4.
Oddly it seems dwordx2 was skipped.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   2 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp  |  10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td|   9 ++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   7 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  10 ++
 .../llvm.amdgcn.global.load.lds.gfx950.ll | 137 ++
 llvm/test/MC/AMDGPU/gfx950_asm_features.s |  37 +
 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt   |  25 
 8 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll
 create mode 100644 llvm/test/MC/AMDGPU/gfx950_asm_features.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 15f33cdbf92e6e..f43ab50d2ea441 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2452,7 +2452,7 @@ class AMDGPUGlobalLoadLDS :
 [],
 [LLVMQualPointerType<1>,// Base global pointer to load from
  LLVMQualPointerType<3>,// LDS base pointer to store to
- llvm_i32_ty,   // Data byte size: 1/2/4
+ llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for 
gfx950)
  llvm_i32_ty,   // imm offset (applied to both global 
and LDS address)
  llvm_i32_ty],  // auxiliary data (imm, cachepolicy 
(bit 0 = sc0,
 //   
bit 1 = sc1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 13de93e829fab2..a6ef0069f134bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3329,6 +3329,16 @@ bool 
AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
   case 4:
 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
+break;
   }
 
   MachineBasicBlock *MBB = MI.getParent();
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db74372e9db452..861fcf017d9e4d 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -934,6 +934,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_usho
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_sshort">;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dword">;
 
+let SubtargetPredicate = HasGFX950Insts in {
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dwordx3">;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dwordx4">;
+}
+
 let SubtargetPredicate = isGFX12Plus in {
   defm GLOBAL_ATOMIC_COND_SUB_U32: FLAT_Global_Atomic_Pseudo 
<"global_atomic_cond_sub_u32", VGPR_32, i32>;
   defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo 
<"global_atomic_ordered_add_b64", VReg_64, i64>;
@@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS 
<0x028, 0x12>;
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;
 
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;
+
+
 defm GLOBAL_ATOMIC_SWAP   : FLAT_Global_Real_Atomics_vi <0x40>;
 defm GLOBAL_ATOMIC_CMPSWAP: FLAT_Global_Real_Atomics_vi <0x41>;
 defm GLOBAL_ATOMIC_ADD: FLAT_Global_Real_Atomics_vi <0x42>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4a6efe533230b1..f3f96940c1f44b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1289,6 +1289,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // hasGFX940Insts and hasGFX90AInsts are also true.
   bool hasGFX950Insts() const { return GFX950Insts; }
 
+  /// Returns true if the target supports
+  /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
+  /// buffer_load_dwo

[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116681?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#116681** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116681?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116681?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#116680** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116680?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116679** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116679?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116678** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116678?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116312** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116312?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116311** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116311?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116310** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116310?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116309** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116309?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116308** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116308?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116307** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116307?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`



This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/116681
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 (PR #116679)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116679?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#116680** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116680?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116679** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116679?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116679?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#116678** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116678?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116312** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116312?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116311** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116311?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116310** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116310?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116309** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116309?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116308** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116308?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116307** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116307?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`



This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/116679
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 global_load_lds_* instructions (PR #116680)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116680?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#116681** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116681?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116680** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116680?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116680?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#116679** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116679?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116678** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116678?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116312** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116312?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116311** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116311?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116310** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116310?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116309** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116309?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116308** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116308?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116307** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116307?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`



This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/116680
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 (PR #116679)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/116679

Unlike the existing gfx940 intrinsics using short/i16 in place of
bfloat, this uses the natural bfloat type.

>From 82bb6e07b68b1df378e89c1eba1f9deb3c2d67f5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 19 Dec 2023 12:46:00 +0700
Subject: [PATCH] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950

Unlike the existing gfx940 intrinsics using short/i16 in place of
bfloat, this uses the natural bfloat type.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   2 +
 .../CodeGenOpenCL/builtins-amdgcn-mfma.cl |   6 +
 .../builtins-amdgcn-error-gfx950-param.cl |   7 +
 .../builtins-amdgcn-error-gfx950.cl   |   5 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   2 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |   6 +
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |   8 +
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll| 474 ++
 llvm/test/MC/AMDGPU/mai-gfx950.s  |  56 ++-
 .../MC/Disassembler/AMDGPU/gfx950_mai.txt |  27 +
 llvm/test/tools/llvm-mca/AMDGPU/gfx950.s  |  10 +-
 12 files changed, 596 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6917d8d1aca69d..7ce8f2c1669d67 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -437,6 +437,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", 
"nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", 
"nc", "gfx950-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", 
"nc", "gfx950-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, 
"V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
+
 
//===--===//
 // GFX12+ only builtins.
 
//===--===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index a644a60f9ec381..841d8fcad0fee0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -24,6 +24,7 @@ typedef short  v8s   __attribute__((ext_vector_type(8)));
 typedef short  v16s  __attribute__((ext_vector_type(16)));
 typedef short  v32s  __attribute__((ext_vector_type(32)));
 typedef double v4d   __attribute__((ext_vector_type(4)));
+typedef __bf16 v8bf16   __attribute__((ext_vector_type(8)));
 
 
 #ifdef MFMA_GFX908_TESTS
@@ -424,5 +425,10 @@ v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
   return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 
x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
+  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
+}
 
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4c267e2cac5cad..4af67763c40dd2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -4,6 +4,7 @@
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
 
 
 void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 
c, int X) {
@@ -19,3 +20,9 @@ void test_mfma_f32_32x32x16_f16(__global float16* out, half8 
a, half8 b, float16
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a 
constant integer}}
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a 
constant integer}}
 }
+
+void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, 
float16 c, int X) {
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0);  // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X);  // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must 

[llvm-branch-commits] [llvm] AMDGPU: Add V_CVT_PK_BF16_F32 for gfx950 (PR #116678)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116678?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#116679** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116679?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116678** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116678?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116678?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#116312** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116312?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116311** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116311?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116310** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116310?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116309** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116309?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116308** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116308?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116307** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116307?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`



This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn 
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/116678
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 (PR #116679)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116679?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#116680** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116680?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116679** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116679?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116679?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#116678** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116678?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116312** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116312?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116311** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116311?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116310** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116310?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116309** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116309?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116308** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116308?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#116307** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/116307?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`



This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn 
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/116679
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/116681

Enforcing this limit in the clang builtin will come later.

>From f5657c9cc25cfed321ced807510a21dc374bcfe3 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 18 Jan 2024 16:18:05 +0700
Subject: [PATCH] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds

Enforcing this limit in the clang builtin will come later.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   8 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp  |  18 ++
 llvm/lib/Target/AMDGPU/BUFInstructions.td |  24 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  16 ++
 .../llvm.amdgcn.global.load.lds.gfx950.ll |   8 +
 ...m.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll | 176 
 ...mdgcn.struct.ptr.buffer.load.lds.gfx950.ll | 196 ++
 llvm/test/MC/AMDGPU/mubuf-gfx950.s|  32 +++
 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt   |  19 ++
 9 files changed, 485 insertions(+), 12 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll
 create mode 100644 llvm/test/MC/AMDGPU/mubuf-gfx950.s

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f43ab50d2ea441..360af786c5160d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1674,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1693,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1715,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
@@ -1735,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a6ef0069f134bd..3522ece24f1c45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3240,6 +3240,24 @@ bool 
AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+ : AMDG

[llvm-branch-commits] [clang] [llvm] AMDGPU: Add first gfx950 mfma instructions (PR #116312)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116312

>From 566cdf85a2a03fc41148715593081643570d6ded Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 21 Nov 2023 10:03:19 +0900
Subject: [PATCH] AMDGPU: Add first gfx950 mfma instructions

Scheduling info and hazards are wrong and TBD.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   6 +
 .../CodeGenOpenCL/builtins-amdgcn-mfma.cl |  25 +-
 .../builtins-amdgcn-error-gfx950-param.cl |  21 ++
 .../builtins-amdgcn-error-gfx950.cl   |  12 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   9 +
 llvm/lib/Target/AMDGPU/AMDGPU.td  |   4 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   4 +-
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   4 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   4 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  22 ++
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |  17 ++
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 274 ++
 llvm/test/MC/AMDGPU/mai-gfx950.s  | 112 +++
 .../MC/Disassembler/AMDGPU/gfx950_mai.txt |  61 
 llvm/test/tools/llvm-mca/AMDGPU/gfx950.s  |  18 ++
 16 files changed, 592 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
 create mode 100644 llvm/test/MC/AMDGPU/mai-gfx950.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
 create mode 100644 llvm/test/tools/llvm-mca/AMDGPU/gfx950.s

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 61516eb2a4a723..6917d8d1aca69d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -431,6 +431,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", 
"nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
 
+//===--===//
+// GFX950 only builtins.
+//===--===//
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", 
"nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", 
"nc", "gfx950-insts")
+
 
//===--===//
 // GFX12+ only builtins.
 
//===--===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index dcdeee6b6acc40..a644a60f9ec381 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 
-DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX908
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a 
-DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX90A
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 
-DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX940
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 
-DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX950
 
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 
@@ -222,7 +223,7 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, 
double b, double c)
 
 #endif // MFMA_GFX90A_TESTS
 
-#ifdef MFMA_GFX940_TESTS
+#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
 // CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
 // CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 
%b, <4 x i32> %c, i32 0, i32 0, i32 0)
 void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
@@ -404,4 +405,24 @@ void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, 
v2i a, v4i b, v16f c, in
 {
   *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
 }
-#endif // MFMA_GFX940_TESTS
+#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
+
+#ifdef MFMA_GFX950_TESTS
+
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x 
half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+
+v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c)
+{
+  return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3);
+}
+
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32

[llvm-branch-commits] [llvm] AMDGPU: Add V_CVT_PK_BF16_F32 for gfx950 (PR #116678)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/116678

None

>From f3682aa080aebde46106fa11176442973ff62c26 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap 
Date: Mon, 5 Feb 2024 04:29:01 -0500
Subject: [PATCH] AMDGPU: Add V_CVT_PK_BF16_F32 for gfx950

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   6 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   1 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  25 ++
 llvm/test/CodeGen/AMDGPU/bf16-conversions.ll  | 395 --
 llvm/test/MC/AMDGPU/gfx950_asm_vop3.s |  26 ++
 .../Disassembler/AMDGPU/gfx950_dasm_vop3.txt  |  19 +
 6 files changed, 255 insertions(+), 217 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/gfx950_asm_vop3.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1e261f4256c93b..ad89812558d25c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -889,6 +889,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::MUL, MVT::i1, Promote);
 
+  if (Subtarget->hasBF16ConversionInsts()) {
+setOperationAction(ISD::FP_ROUND, MVT::v2bf16, Legal);
+setOperationAction(ISD::FP_ROUND, MVT::bf16, Legal);
+setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
+  }
+
   setTargetDAGCombine({ISD::ADD,
ISD::UADDO_CARRY,
ISD::SUB,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 882e147dc231fa..7df9be5c6f7a0b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2787,6 +2787,7 @@ def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, 
untyped]>;
 def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], 
/*EnableClamp=*/1>;
 def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
 def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
 
 def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
 def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td 
b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 551e8b3a679202..917e1b3974b46a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -944,6 +944,30 @@ let SubtargetPredicate = isGFX11Plus in {
   defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", 
VOP3_Profile>;
 } // End SubtargetPredicate = isGFX11Plus
 
+// FIXME: GlobalISel cannot distinguish f16 and bf16 and may start using bf16 
patterns
+//instead of less complex f16. Disable GlobalISel for these for now.
+def bf16_fpround : PatFrag <(ops node:$src0),  (fpround $src0), [{ return 
true; }]> {
+  let GISelPredicateCode = [{return false;}];
+}
+
+let SubtargetPredicate = HasBF16ConversionInsts in {
+  let ReadsModeReg = 0 in {
+defm V_CVT_PK_BF16_F32: VOP3Inst<"v_cvt_pk_bf16_f32", 
VOP3_Profile>;
+  }
+  def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)),
+   (V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 
0, (EXTRACT_SUBREG VReg_64:$src, sub1))>;
+  def : GCNPat<(v2bf16 (bf16_fpround v2f64:$src)),
+   (V_CVT_PK_BF16_F32_e64 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG 
VReg_128:$src, sub0_sub1)),
+  0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG 
VReg_128:$src, sub2_sub3)))>;
+  def : GCNPat<(v2bf16 (build_vector (bf16 (bf16_fpround (f32 (VOP3Mods 
f32:$src0, i32:$src0_modifiers,
+ (bf16 (bf16_fpround (f32 (VOP3Mods 
f32:$src1, i32:$src1_modifiers)),
+   (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, 
$src1)>;
+  def : GCNPat<(bf16 (bf16_fpround (f32 (VOP3Mods f32:$src0, 
i32:$src0_modifiers,
+   (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 
(IMPLICIT_DEF)))>;
+  def : GCNPat<(bf16 (bf16_fpround (f64 (VOP3Mods f64:$src0, 
i32:$src0_modifiers,
+   (V_CVT_PK_BF16_F32_e64 0, (f32 (V_CVT_F32_F64_e64 
$src0_modifiers, $src0)), 0, (f32 (IMPLICIT_DEF)))>;
+}
+
 let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
   defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", 
VOP3_Profile>;
   defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", 
VOP3_Profile>;
@@ -1721,5 +1745,6 @@ defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
 
 defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
 defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
+defm V_CVT_PK_BF16_F32: VOP3OpSel_Real_gfx9 <0x268>;
 defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
 defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll 
b/llvm/test/CodeGen/AMDGPU/bf16-conversions.l

[llvm-branch-commits] [clang] [llvm] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 (PR #116679)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes

Unlike the existing gfx940 intrinsics using short/i16 in place of
bfloat, this uses the natural bfloat type.

---

Patch is 40.83 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/116679.diff


12 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+2) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl (+6) 
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+7) 
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl (+4-1) 
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+2) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1) 
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+6) 
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+8) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll (+474) 
- (modified) llvm/test/MC/AMDGPU/mai-gfx950.s (+52-4) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt (+27) 
- (modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+7-3) 


```diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6917d8d1aca69d..7ce8f2c1669d67 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -437,6 +437,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", 
"nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", 
"nc", "gfx950-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", 
"nc", "gfx950-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, 
"V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
+
 
//===--===//
 // GFX12+ only builtins.
 
//===--===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index a644a60f9ec381..841d8fcad0fee0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -24,6 +24,7 @@ typedef short  v8s   __attribute__((ext_vector_type(8)));
 typedef short  v16s  __attribute__((ext_vector_type(16)));
 typedef short  v32s  __attribute__((ext_vector_type(32)));
 typedef double v4d   __attribute__((ext_vector_type(4)));
+typedef __bf16 v8bf16   __attribute__((ext_vector_type(8)));
 
 
 #ifdef MFMA_GFX908_TESTS
@@ -424,5 +425,10 @@ v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
   return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 
x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
+  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
+}
 
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4c267e2cac5cad..4af67763c40dd2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -4,6 +4,7 @@
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
 
 
 void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 
c, int X) {
@@ -19,3 +20,9 @@ void test_mfma_f32_32x32x16_f16(__global float16* out, half8 
a, half8 b, float16
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a 
constant integer}}
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a 
constant integer}}
 }
+
+void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, 
float16 c, int X) {
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0);  // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X);  // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx

[llvm-branch-commits] [clang] [llvm] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 (PR #116679)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-mc
@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-clang

Author: Matt Arsenault (arsenm)


Changes

Unlike the existing gfx940 intrinsics using short/i16 in place of
bfloat, this uses the natural bfloat type.

---

Patch is 40.83 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/116679.diff


12 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+2) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl (+6) 
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+7) 
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl (+4-1) 
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+2) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1) 
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+6) 
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+8) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll (+474) 
- (modified) llvm/test/MC/AMDGPU/mai-gfx950.s (+52-4) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt (+27) 
- (modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+7-3) 


```diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6917d8d1aca69d..7ce8f2c1669d67 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -437,6 +437,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", 
"nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", 
"nc", "gfx950-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", 
"nc", "gfx950-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, 
"V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
+
 
//===--===//
 // GFX12+ only builtins.
 
//===--===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index a644a60f9ec381..841d8fcad0fee0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -24,6 +24,7 @@ typedef short  v8s   __attribute__((ext_vector_type(8)));
 typedef short  v16s  __attribute__((ext_vector_type(16)));
 typedef short  v32s  __attribute__((ext_vector_type(32)));
 typedef double v4d   __attribute__((ext_vector_type(4)));
+typedef __bf16 v8bf16   __attribute__((ext_vector_type(8)));
 
 
 #ifdef MFMA_GFX908_TESTS
@@ -424,5 +425,10 @@ v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
   return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 
x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
+  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
+}
 
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4c267e2cac5cad..4af67763c40dd2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -4,6 +4,7 @@
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
 
 
 void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 
c, int X) {
@@ -19,3 +20,9 @@ void test_mfma_f32_32x32x16_f16(__global float16* out, half8 
a, half8 b, float16
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a 
constant integer}}
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a 
constant integer}}
 }
+
+void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, 
float16 c, int X) {
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0);  // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X);  // 
expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a 
constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl 
b/cla

[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 global_load_lds_* instructions (PR #116680)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/116680
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add V_CVT_PK_BF16_F32 for gfx950 (PR #116678)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---

Patch is 27.17 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/116678.diff


6 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+6) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1) 
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+25) 
- (modified) llvm/test/CodeGen/AMDGPU/bf16-conversions.ll (+178-217) 
- (added) llvm/test/MC/AMDGPU/gfx950_asm_vop3.s (+26) 
- (added) llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt (+19) 


``diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1e261f4256c93b..ad89812558d25c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -889,6 +889,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::MUL, MVT::i1, Promote);
 
+  if (Subtarget->hasBF16ConversionInsts()) {
+setOperationAction(ISD::FP_ROUND, MVT::v2bf16, Legal);
+setOperationAction(ISD::FP_ROUND, MVT::bf16, Legal);
+setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
+  }
+
   setTargetDAGCombine({ISD::ADD,
ISD::UADDO_CARRY,
ISD::SUB,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 882e147dc231fa..7df9be5c6f7a0b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2787,6 +2787,7 @@ def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, 
untyped]>;
 def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], 
/*EnableClamp=*/1>;
 def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
 def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
 
 def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
 def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td 
b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 551e8b3a679202..917e1b3974b46a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -944,6 +944,30 @@ let SubtargetPredicate = isGFX11Plus in {
   defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", 
VOP3_Profile>;
 } // End SubtargetPredicate = isGFX11Plus
 
+// FIXME: GlobalISel cannot distinguish f16 and bf16 and may start using bf16 
patterns
+//instead of less complex f16. Disable GlobalISel for these for now.
+def bf16_fpround : PatFrag <(ops node:$src0),  (fpround $src0), [{ return 
true; }]> {
+  let GISelPredicateCode = [{return false;}];
+}
+
+let SubtargetPredicate = HasBF16ConversionInsts in {
+  let ReadsModeReg = 0 in {
+defm V_CVT_PK_BF16_F32: VOP3Inst<"v_cvt_pk_bf16_f32", 
VOP3_Profile>;
+  }
+  def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)),
+   (V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 
0, (EXTRACT_SUBREG VReg_64:$src, sub1))>;
+  def : GCNPat<(v2bf16 (bf16_fpround v2f64:$src)),
+   (V_CVT_PK_BF16_F32_e64 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG 
VReg_128:$src, sub0_sub1)),
+  0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG 
VReg_128:$src, sub2_sub3)))>;
+  def : GCNPat<(v2bf16 (build_vector (bf16 (bf16_fpround (f32 (VOP3Mods 
f32:$src0, i32:$src0_modifiers,
+ (bf16 (bf16_fpround (f32 (VOP3Mods 
f32:$src1, i32:$src1_modifiers)),
+   (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, 
$src1)>;
+  def : GCNPat<(bf16 (bf16_fpround (f32 (VOP3Mods f32:$src0, 
i32:$src0_modifiers,
+   (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 
(IMPLICIT_DEF)))>;
+  def : GCNPat<(bf16 (bf16_fpround (f64 (VOP3Mods f64:$src0, 
i32:$src0_modifiers,
+   (V_CVT_PK_BF16_F32_e64 0, (f32 (V_CVT_F32_F64_e64 
$src0_modifiers, $src0)), 0, (f32 (IMPLICIT_DEF)))>;
+}
+
 let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
   defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", 
VOP3_Profile>;
   defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", 
VOP3_Profile>;
@@ -1721,5 +1745,6 @@ defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
 
 defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
 defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
+defm V_CVT_PK_BF16_F32: VOP3OpSel_Real_gfx9 <0x268>;
 defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
 defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll 
b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 425fc5884cec7f..135efceb31fdda 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -24,139 +24,168 @@ define amd

[llvm-branch-commits] [clang] [llvm] AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 (PR #116679)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/116679
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add V_CVT_PK_BF16_F32 for gfx950 (PR #116678)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/116678
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-ir

Author: Matt Arsenault (arsenm)


Changes

Enforcing this limit in the clang builtin will come later.

---

Patch is 31.94 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/116681.diff


9 Files Affected:

- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+4-4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+18) 
- (modified) llvm/lib/Target/AMDGPU/BUFInstructions.td (+16-8) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+16) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll 
(+8) 
- (added) 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll (+176) 
- (added) 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll 
(+196) 
- (added) llvm/test/MC/AMDGPU/mubuf-gfx950.s (+32) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950.txt (+19) 


``diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f43ab50d2ea441..360af786c5160d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1674,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1693,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1715,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
@@ -1735,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a6ef0069f134bd..3522ece24f1c45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3240,6 +3240,24 @@ bool 
AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+break;
   }
 
   MachineBasicBlock *MBB = MI.getParent();
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 79d6a825f60b03..7283733dea22db 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstr

[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Matt Arsenault (arsenm)


Changes

Enforcing this limit in the clang builtin will come later.

---

Patch is 31.94 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/116681.diff


9 Files Affected:

- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+4-4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+18) 
- (modified) llvm/lib/Target/AMDGPU/BUFInstructions.td (+16-8) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+16) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll 
(+8) 
- (added) 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll (+176) 
- (added) 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll 
(+196) 
- (added) llvm/test/MC/AMDGPU/mubuf-gfx950.s (+32) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950.txt (+19) 


``diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f43ab50d2ea441..360af786c5160d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1674,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1693,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1715,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
@@ -1735,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a6ef0069f134bd..3522ece24f1c45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3240,6 +3240,24 @@ bool 
AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+break;
   }
 
   MachineBasicBlock *MBB = MI.getParent();
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 79d6a825f60b03..7283733dea22db 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstruc

[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-mc

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes

Enforcing this limit in the clang builtin will come later.

---

Patch is 31.94 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/116681.diff


9 Files Affected:

- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+4-4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+18) 
- (modified) llvm/lib/Target/AMDGPU/BUFInstructions.td (+16-8) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+16) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll 
(+8) 
- (added) 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll (+176) 
- (added) 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll 
(+196) 
- (added) llvm/test/MC/AMDGPU/mubuf-gfx950.s (+32) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950.txt (+19) 


``diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f43ab50d2ea441..360af786c5160d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1674,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1693,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1715,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
@@ -1735,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a6ef0069f134bd..3522ece24f1c45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3240,6 +3240,24 @@ bool 
AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+break;
   }
 
   MachineBasicBlock *MBB = MI.getParent();
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 79d6a825f60b03..7283733dea22db 100644
--- a/

[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 global_load_lds_* instructions (PR #116680)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-llvm-analysis

@llvm/pr-subscribers-clang

Author: Matt Arsenault (arsenm)


Changes

Define global_load_lds_dwordx3 and global_load_lds_dwordx4.
Oddly it seems dwordx2 was skipped.

---
Full diff: https://github.com/llvm/llvm-project/pull/116680.diff


8 Files Affected:

- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+1-1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+10) 
- (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+9) 
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+7) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+10) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll (+137) 
- (added) llvm/test/MC/AMDGPU/gfx950_asm_features.s (+37) 
- (added) llvm/test/MC/Disassembler/AMDGPU/gfx950.txt (+25) 


``diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 15f33cdbf92e6e..f43ab50d2ea441 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2452,7 +2452,7 @@ class AMDGPUGlobalLoadLDS :
 [],
 [LLVMQualPointerType<1>,// Base global pointer to load from
  LLVMQualPointerType<3>,// LDS base pointer to store to
- llvm_i32_ty,   // Data byte size: 1/2/4
+ llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for 
gfx950)
  llvm_i32_ty,   // imm offset (applied to both global 
and LDS address)
  llvm_i32_ty],  // auxiliary data (imm, cachepolicy 
(bit 0 = sc0,
 //   
bit 1 = sc1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 13de93e829fab2..a6ef0069f134bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3329,6 +3329,16 @@ bool 
AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
   case 4:
 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
+break;
   }
 
   MachineBasicBlock *MBB = MI.getParent();
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db74372e9db452..861fcf017d9e4d 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -934,6 +934,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_usho
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_sshort">;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dword">;
 
+let SubtargetPredicate = HasGFX950Insts in {
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dwordx3">;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dwordx4">;
+}
+
 let SubtargetPredicate = isGFX12Plus in {
   defm GLOBAL_ATOMIC_COND_SUB_U32: FLAT_Global_Atomic_Pseudo 
<"global_atomic_cond_sub_u32", VGPR_32, i32>;
   defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo 
<"global_atomic_ordered_add_b64", VReg_64, i64>;
@@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS 
<0x028, 0x12>;
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;
 
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;
+
+
 defm GLOBAL_ATOMIC_SWAP   : FLAT_Global_Real_Atomics_vi <0x40>;
 defm GLOBAL_ATOMIC_CMPSWAP: FLAT_Global_Real_Atomics_vi <0x41>;
 defm GLOBAL_ATOMIC_ADD: FLAT_Global_Real_Atomics_vi <0x42>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4a6efe533230b1..f3f96940c1f44b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1289,6 +1289,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // hasGFX940Insts and hasGFX90AInsts are also true.
   bool hasGFX950Insts() const { return GFX950Insts; }
 
+  /// Returns true if the target supports
+  /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
+  /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
+  bool hasLDSLoadB96_B128() const {
+return hasGFX950Insts();
+  }
+
   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
 
   bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ad8981

[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/116681
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread via llvm-branch-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


``bash
git-clang-format --diff 42f311ceb555ea2b3f171ad2ef8254e971e0be12 
f5657c9cc25cfed321ced807510a21dc374bcfe3 --extensions cpp -- 
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
``





View the diff from clang-format here.


``diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3522ece24f..707136409e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3244,19 +3244,19 @@ bool 
AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
 if (!Subtarget->hasLDSLoadB96_B128())
   return false;
 
-Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
- : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
-: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
- : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+Opc = HasVIndex? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+  : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
 break;
   case 16:
 if (!Subtarget->hasLDSLoadB96_B128())
   return false;
 
-Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
- : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
-: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
- : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+Opc = HasVIndex? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+  : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
 break;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b02f9bf80..1763c1f0aa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9828,18 +9828,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue 
Op,
 case 12:
   if (!Subtarget->hasLDSLoadB96_B128())
 return SDValue();
-  Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
-   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
-  : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
-   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+  Opc = HasVIndex? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+  : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
   break;
 case 16:
   if (!Subtarget->hasLDSLoadB96_B128())
 return SDValue();
-  Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
-   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
-  : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
-   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+  Opc = HasVIndex? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+  : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
   break;
 }
 

``




https://github.com/llvm/llvm-project/pull/116681
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 global_load_lds_* instructions (PR #116680)

2024-11-18 Thread via llvm-branch-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


``bash
git-clang-format --diff 82bb6e07b68b1df378e89c1eba1f9deb3c2d67f5 
42f311ceb555ea2b3f171ad2ef8254e971e0be12 --extensions cpp,h -- 
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
llvm/lib/Target/AMDGPU/GCNSubtarget.h llvm/lib/Target/AMDGPU/SIISelLowering.cpp
``





View the diff from clang-format here.


``diff
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f3f96940c1..b27f9a0612 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1292,9 +1292,7 @@ public:
   /// Returns true if the target supports
   /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
   /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
-  bool hasLDSLoadB96_B128() const {
-return hasGFX950Insts();
-  }
+  bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
 
   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
 

``




https://github.com/llvm/llvm-project/pull/116680
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port SpillPlacement analysis to NPM (PR #116618)

2024-11-18 Thread via llvm-branch-commits


@@ -223,7 +248,7 @@ void SpillPlacement::activate(unsigned n) {
   if (ActiveNodes->test(n))
 return;
   ActiveNodes->set(n);
-  nodes[n].clear(Threshold);
+  nodes.get()[n].clear(Threshold);

paperchalice wrote:

You can use `operator[]` directly if `nodes` is the array form of `unique_ptr`.

https://github.com/llvm/llvm-project/pull/116618
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][Transforms] Add 1:N `matchAndRewrite` overload (PR #116470)

2024-11-18 Thread Markus Böck via llvm-branch-commits

https://github.com/zero9178 approved this pull request.

LGTM, thank you!

https://github.com/llvm/llvm-project/pull/116470
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] release/19.x: Reapply "[sanitizer_common] AND signals in BlockSignals instead of deleting (#113443)" for non-Android Linux only (#115790) (PR #116670)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-compiler-rt-sanitizer

Author: None (llvmbot)


Changes

Backport 531acf9e2f24977d2556b39229b22f4518a1faa5

Requested by: @thurstond

---
Full diff: https://github.com/llvm/llvm-project/pull/116670.diff


3 Files Affected:

- (modified) compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp (+39-16) 
- (modified) compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt (+1) 
- (added) compiler-rt/lib/sanitizer_common/tests/sanitizer_block_signals.cpp 
(+76) 


``diff
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp 
b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index b9b1f496df7c98..be3b3bd94e2a58 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -160,33 +160,56 @@ void SetSigProcMask(__sanitizer_sigset_t *set, 
__sanitizer_sigset_t *oldset) {
   CHECK_EQ(0, internal_sigprocmask(SIG_SETMASK, set, oldset));
 }
 
+#  if SANITIZER_LINUX
+// Deletes the specified signal from newset, if it is not present in oldset
+// Equivalently: newset[signum] = newset[signum] & oldset[signum]
+static void KeepUnblocked(__sanitizer_sigset_t &newset,
+  __sanitizer_sigset_t &oldset, int signum) {
+  // FIXME: https://github.com/google/sanitizers/issues/1816
+  if (SANITIZER_ANDROID || !internal_sigismember(&oldset, signum))
+internal_sigdelset(&newset, signum);
+}
+#  endif
+
 // Block asynchronous signals
 void BlockSignals(__sanitizer_sigset_t *oldset) {
-  __sanitizer_sigset_t set;
-  internal_sigfillset(&set);
-#  if SANITIZER_LINUX && !SANITIZER_ANDROID
+  __sanitizer_sigset_t newset;
+  internal_sigfillset(&newset);
+
+#  if SANITIZER_LINUX
+  __sanitizer_sigset_t currentset;
+
+#if !SANITIZER_ANDROID
+  // FIXME: https://github.com/google/sanitizers/issues/1816
+  SetSigProcMask(NULL, &currentset);
+
   // Glibc uses SIGSETXID signal during setuid call. If this signal is blocked
   // on any thread, setuid call hangs.
   // See test/sanitizer_common/TestCases/Linux/setuid.c.
-  internal_sigdelset(&set, 33);
-#  endif
-#  if SANITIZER_LINUX
+  KeepUnblocked(newset, currentset, 33);
+#endif  // !SANITIZER_ANDROID
+
   // Seccomp-BPF-sandboxed processes rely on SIGSYS to handle trapped syscalls.
   // If this signal is blocked, such calls cannot be handled and the process 
may
   // hang.
-  internal_sigdelset(&set, 31);
+  KeepUnblocked(newset, currentset, 31);
 
+#if !SANITIZER_ANDROID
   // Don't block synchronous signals
-  internal_sigdelset(&set, SIGSEGV);
-  internal_sigdelset(&set, SIGBUS);
-  internal_sigdelset(&set, SIGILL);
-  internal_sigdelset(&set, SIGTRAP);
-  internal_sigdelset(&set, SIGABRT);
-  internal_sigdelset(&set, SIGFPE);
-  internal_sigdelset(&set, SIGPIPE);
-#  endif
+  // but also don't unblock signals that the user had deliberately blocked.
+  // FIXME: https://github.com/google/sanitizers/issues/1816
+  KeepUnblocked(newset, currentset, SIGSEGV);
+  KeepUnblocked(newset, currentset, SIGBUS);
+  KeepUnblocked(newset, currentset, SIGILL);
+  KeepUnblocked(newset, currentset, SIGTRAP);
+  KeepUnblocked(newset, currentset, SIGABRT);
+  KeepUnblocked(newset, currentset, SIGFPE);
+  KeepUnblocked(newset, currentset, SIGPIPE);
+#endif  //! SANITIZER_ANDROID
+
+#  endif  // SANITIZER_LINUX
 
-  SetSigProcMask(&set, oldset);
+  SetSigProcMask(&newset, oldset);
 }
 
 ScopedBlockSignals::ScopedBlockSignals(__sanitizer_sigset_t *copy) {
diff --git a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt 
b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
index 2b4c15125263a9..fef8bb772e0e0d 100644
--- a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
+++ b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
@@ -15,6 +15,7 @@ set(SANITIZER_UNITTESTS
   sanitizer_array_ref_test.cpp
   sanitizer_atomic_test.cpp
   sanitizer_bitvector_test.cpp
+  sanitizer_block_signals.cpp
   sanitizer_bvgraph_test.cpp
   sanitizer_chained_origin_depot_test.cpp
   sanitizer_common_test.cpp
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_block_signals.cpp 
b/compiler-rt/lib/sanitizer_common/tests/sanitizer_block_signals.cpp
new file mode 100644
index 00..b43648a8aef230
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_block_signals.cpp
@@ -0,0 +1,76 @@
+//===-- sanitizer_block_signals.cpp 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This file is a part of sanitizer_common unit tests.
+//
+//===--===//
+#include 
+#include 
+
+#include "gtest/gtest.h"
+#include "sanitizer_common/sanitizer_linux.h"
+
+namespace __sanitizer {

[llvm-branch-commits] [compiler-rt] release/19.x: Reapply "[sanitizer_common] AND signals in BlockSignals instead of deleting (#113443)" for non-Android Linux only (#115790) (PR #116670)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:

@thurstond What do you think about merging this PR to the release branch?

https://github.com/llvm/llvm-project/pull/116670
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [RISCV] Support __builtin_cpu_is (PR #116231)

2024-11-18 Thread Craig Topper via llvm-branch-commits

https://github.com/topperc edited 
https://github.com/llvm/llvm-project/pull/116231
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [RISCV] Support __builtin_cpu_is (PR #116231)

2024-11-18 Thread Craig Topper via llvm-branch-commits


@@ -22505,6 +22506,47 @@ Value 
*CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
   return nullptr;
 }
 
+Value *CodeGenFunction::EmitRISCVCpuIs(const CallExpr *E) {
+  const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
+  StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
+  return EmitRISCVCpuIs(CPUStr);
+}
+
+Value *CodeGenFunction::EmitRISCVCpuIs(StringRef CPUStr) {
+  llvm::Type *Int32Ty = Builder.getInt32Ty();
+  llvm::Type *Int64Ty = Builder.getInt64Ty();
+  llvm::StructType *StructTy = llvm::StructType::get(Int32Ty, Int64Ty, 
Int64Ty);
+  llvm::Constant *RISCVCPUModel =
+  CGM.CreateRuntimeVariable(StructTy, "__riscv_cpu_model");
+  cast<llvm::GlobalValue>(RISCVCPUModel)->setDSOLocal(true);
+
+  auto loadRISCVCPUID = [&](unsigned Index) {
+Value *Ptr = Builder.CreateStructGEP(StructTy, RISCVCPUModel, Index);
+Value *CPUID = Builder.CreateAlignedLoad(StructTy->getTypeAtIndex(Index),

topperc wrote:

You can use `CreateLoad` to avoid llvm::MaybeAlign()

https://github.com/llvm/llvm-project/pull/116231
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] release/19.x: Reapply "[sanitizer_common] AND signals in BlockSignals instead of deleting (#113443)" for non-Android Linux only (#115790) (PR #116670)

2024-11-18 Thread via llvm-branch-commits

https://github.com/llvmbot created 
https://github.com/llvm/llvm-project/pull/116670

Backport 531acf9e2f24977d2556b39229b22f4518a1faa5

Requested by: @thurstond

>From 6925f3c7c7d8b83e2195cb8e473eccdecae42607 Mon Sep 17 00:00:00 2001
From: Thurston Dang 
Date: Thu, 14 Nov 2024 10:35:35 -0800
Subject: [PATCH] Reapply "[sanitizer_common] AND signals in BlockSignals
 instead of deleting (#113443)" for non-Android Linux only (#115790)

The original patch (25fd366d6a7d40266ff27c134ed8beb0a90cc33b) was
reverted in 083a5cdbeab09517d8345868970d4f41170d7ed2 because it broke
some buildbots.

This revised patch makes two changes:
- Reverts to *pre-#98200* behavior for Android. This avoids a build
breakage on Android.
- Only define KeepUnblocked if SANITIZER_LINUX: this avoids a build
breakage on solaris, which does not support internal_sigdelset.
N.B. Other buildbot failures were non-sanitizer tests and are therefore
unrelated.

Original commit message:
My earlier patch https://github.com/llvm/llvm-project/pull/98200
caused a regression because it unconditionally unblocked synchronous
signals, even if the user program had deliberately blocked them.
This patch fixes the issue by checking the current signal mask, as
suggested by Vitaly. It also adds tests.
Fixes #113385

(cherry picked from commit 531acf9e2f24977d2556b39229b22f4518a1faa5)
---
 .../lib/sanitizer_common/sanitizer_linux.cpp  | 55 ++
 .../lib/sanitizer_common/tests/CMakeLists.txt |  1 +
 .../tests/sanitizer_block_signals.cpp | 76 +++
 3 files changed, 116 insertions(+), 16 deletions(-)
 create mode 100644 
compiler-rt/lib/sanitizer_common/tests/sanitizer_block_signals.cpp

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp 
b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index b9b1f496df7c98..be3b3bd94e2a58 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -160,33 +160,56 @@ void SetSigProcMask(__sanitizer_sigset_t *set, 
__sanitizer_sigset_t *oldset) {
   CHECK_EQ(0, internal_sigprocmask(SIG_SETMASK, set, oldset));
 }
 
+#  if SANITIZER_LINUX
+// Deletes the specified signal from newset, if it is not present in oldset
+// Equivalently: newset[signum] = newset[signum] & oldset[signum]
+static void KeepUnblocked(__sanitizer_sigset_t &newset,
+  __sanitizer_sigset_t &oldset, int signum) {
+  // FIXME: https://github.com/google/sanitizers/issues/1816
+  if (SANITIZER_ANDROID || !internal_sigismember(&oldset, signum))
+internal_sigdelset(&newset, signum);
+}
+#  endif
+
 // Block asynchronous signals
 void BlockSignals(__sanitizer_sigset_t *oldset) {
-  __sanitizer_sigset_t set;
-  internal_sigfillset(&set);
-#  if SANITIZER_LINUX && !SANITIZER_ANDROID
+  __sanitizer_sigset_t newset;
+  internal_sigfillset(&newset);
+
+#  if SANITIZER_LINUX
+  __sanitizer_sigset_t currentset;
+
+#if !SANITIZER_ANDROID
+  // FIXME: https://github.com/google/sanitizers/issues/1816
+  SetSigProcMask(NULL, &currentset);
+
   // Glibc uses SIGSETXID signal during setuid call. If this signal is blocked
   // on any thread, setuid call hangs.
   // See test/sanitizer_common/TestCases/Linux/setuid.c.
-  internal_sigdelset(&set, 33);
-#  endif
-#  if SANITIZER_LINUX
+  KeepUnblocked(newset, currentset, 33);
+#endif  // !SANITIZER_ANDROID
+
   // Seccomp-BPF-sandboxed processes rely on SIGSYS to handle trapped syscalls.
   // If this signal is blocked, such calls cannot be handled and the process 
may
   // hang.
-  internal_sigdelset(&set, 31);
+  KeepUnblocked(newset, currentset, 31);
 
+#if !SANITIZER_ANDROID
   // Don't block synchronous signals
-  internal_sigdelset(&set, SIGSEGV);
-  internal_sigdelset(&set, SIGBUS);
-  internal_sigdelset(&set, SIGILL);
-  internal_sigdelset(&set, SIGTRAP);
-  internal_sigdelset(&set, SIGABRT);
-  internal_sigdelset(&set, SIGFPE);
-  internal_sigdelset(&set, SIGPIPE);
-#  endif
+  // but also don't unblock signals that the user had deliberately blocked.
+  // FIXME: https://github.com/google/sanitizers/issues/1816
+  KeepUnblocked(newset, currentset, SIGSEGV);
+  KeepUnblocked(newset, currentset, SIGBUS);
+  KeepUnblocked(newset, currentset, SIGILL);
+  KeepUnblocked(newset, currentset, SIGTRAP);
+  KeepUnblocked(newset, currentset, SIGABRT);
+  KeepUnblocked(newset, currentset, SIGFPE);
+  KeepUnblocked(newset, currentset, SIGPIPE);
+#endif  //! SANITIZER_ANDROID
+
+#  endif  // SANITIZER_LINUX
 
-  SetSigProcMask(&set, oldset);
+  SetSigProcMask(&newset, oldset);
 }
 
 ScopedBlockSignals::ScopedBlockSignals(__sanitizer_sigset_t *copy) {
diff --git a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt 
b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
index 2b4c15125263a9..fef8bb772e0e0d 100644
--- a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
+++ b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
@@ -15,6 +15,7 @@ set(SANITIZE

[llvm-branch-commits] [compiler-rt] release/19.x: Reapply "[sanitizer_common] AND signals in BlockSignals instead of deleting (#113443)" for non-Android Linux only (#115790) (PR #116670)

2024-11-18 Thread Thurston Dang via llvm-branch-commits

https://github.com/thurstond approved this pull request.


https://github.com/llvm/llvm-project/pull/116670
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] AMDGPU: Add first gfx950 mfma instructions (PR #116312)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116312

>From 56e2ba8ee3266bdef464e456e06e67b45f946ef0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 21 Nov 2023 10:03:19 +0900
Subject: [PATCH] AMDGPU: Add first gfx950 mfma instructions

Scheduling info and hazards are wrong and TBD.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   6 +
 .../CodeGenOpenCL/builtins-amdgcn-mfma.cl |  25 +-
 .../builtins-amdgcn-error-gfx950-param.cl |  21 ++
 .../builtins-amdgcn-error-gfx950.cl   |  12 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   9 +
 llvm/lib/Target/AMDGPU/AMDGPU.td  |   4 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   4 +-
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   4 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   4 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  22 ++
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |  17 ++
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 274 ++
 llvm/test/MC/AMDGPU/mai-gfx950.s  | 112 +++
 .../MC/Disassembler/AMDGPU/gfx950_mai.txt |  61 
 llvm/test/tools/llvm-mca/AMDGPU/gfx950.s  |  18 ++
 16 files changed, 592 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
 create mode 100644 llvm/test/MC/AMDGPU/mai-gfx950.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
 create mode 100644 llvm/test/tools/llvm-mca/AMDGPU/gfx950.s

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 61516eb2a4a723..6917d8d1aca69d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -431,6 +431,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", 
"nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", 
"fp8-conversion-insts")
 
+//===--===//
+// GFX950 only builtins.
+//===--===//
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", 
"nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", 
"nc", "gfx950-insts")
+
 
//===--===//
 // GFX12+ only builtins.
 
//===--===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index dcdeee6b6acc40..a644a60f9ec381 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 
-DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX908
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a 
-DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX90A
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 
-DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX940
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 
-DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX950
 
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 
@@ -222,7 +223,7 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, 
double b, double c)
 
 #endif // MFMA_GFX90A_TESTS
 
-#ifdef MFMA_GFX940_TESTS
+#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
 // CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
 // CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 
%b, <4 x i32> %c, i32 0, i32 0, i32 0)
 void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
@@ -404,4 +405,24 @@ void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, 
v2i a, v4i b, v16f c, in
 {
   *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
 }
-#endif // MFMA_GFX940_TESTS
+#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
+
+#ifdef MFMA_GFX950_TESTS
+
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x 
half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+
+v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c)
+{
+  return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3);
+}
+
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32

[llvm-branch-commits] [llvm] AMDGPU: Increase the LDS size to support to 160 KB for gfx950 (PR #116309)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116309

>From 74ed0a510ff829e5e98d9edf0284ee4decfa4bc0 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap 
Date: Wed, 13 Dec 2023 00:27:03 -0500
Subject: [PATCH 1/2] AMDGPU: Increase the LDS size to support to 160 KB for
 gfx950

---
 llvm/docs/AMDGPUUsage.rst |  2 +
 llvm/lib/Target/AMDGPU/AMDGPU.td  |  3 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   | 12 +++--
 llvm/lib/Target/AMDGPU/AMDGPUFeatures.td  |  1 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp|  2 +
 llvm/test/CodeGen/AMDGPU/extra-lds-size.ll|  7 +++
 .../AMDGPU/lds-limit-diagnostics-gfx950.ll| 13 +
 .../CodeGen/AMDGPU/lds-size-hsa-gfx950.ll | 31 +++
 .../CodeGen/AMDGPU/lds-size-pal-gfx950.ll | 26 ++
 .../tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s | 52 +++
 10 files changed, 144 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
 create mode 100644 llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index b85b680b9c82d3..a25b6feddbeddc 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5475,6 +5475,8 @@ The fields used by CP for code objects before V3 also 
match those specified in
roundup(lds-size / (64 
* 4))
  GFX7-GFX11
roundup(lds-size / (128 
* 4))
+ GFX950
+   roundup(lds-size / (320 
* 4))
 
  24  1 bit   ENABLE_EXCEPTION_IEEE_754_FPWavefront starts execution
  _INVALID_OPERATION  with specified exceptions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 35dbf86b7c6f36..a05d4a644d08d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1494,7 +1494,8 @@ def FeatureISAVersion9_5_Common : FeatureSet<
   [FeatureFP8Insts,
FeatureFP8ConversionInsts,
FeatureCvtFP8VOP1Bug,
-   FeatureGFX950Insts
+   FeatureGFX950Insts,
+   FeatureAddressableLocalMemorySize163840
])>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d801f2b1591275..90ece275412c7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1172,12 +1172,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo 
&ProgInfo,
   ProgInfo.DX10Clamp = Mode.DX10Clamp;
 
   unsigned LDSAlignShift;
-  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
-// LDS is allocated in 64 dword blocks.
-LDSAlignShift = 8;
-  } else {
+  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+// LDS is allocated in 320 dword blocks.
+LDSAlignShift = 11;
+  } else if (STM.getFeatureBits().test(
+ FeatureAddressableLocalMemorySize65536)) {
 // LDS is allocated in 128 dword blocks.
 LDSAlignShift = 9;
+  } else {
+// LDS is allocated in 64 dword blocks.
+LDSAlignShift = 8;
   }
 
   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td 
b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index f832a2a55d6229..74d1faeb6f545b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -29,6 +29,7 @@ class SubtargetFeatureAddressableLocalMemorySize  
: SubtargetFeature<
 
 def FeatureAddressableLocalMemorySize32768 : 
SubtargetFeatureAddressableLocalMemorySize<32768>;
 def FeatureAddressableLocalMemorySize65536 : 
SubtargetFeatureAddressableLocalMemorySize<65536>;
+def FeatureAddressableLocalMemorySize163840 : 
SubtargetFeatureAddressableLocalMemorySize<163840>;
 
 class SubtargetFeatureWavefrontSize  : SubtargetFeature<
   "wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 01866fbd9da6e7..501d00b1f308d9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -916,6 +916,8 @@ unsigned getAddressableLocalMemorySize(const 
MCSubtargetInfo *STI) {
 return 32768;
   if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
 return 65536;
+  if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
+return 163840;
   return 0;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll 
b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index 13640b74a7937b..318ecd16a2cc

[llvm-branch-commits] [flang] [llvm] [flang][OpenMP] Change clause modifier representation in parser (PR #116656)

2024-11-18 Thread Krzysztof Parzyszek via llvm-branch-commits

https://github.com/kparzysz created 
https://github.com/llvm/llvm-project/pull/116656

The main issue to solve is that OpenMP modifiers can be specified in any order, 
so the parser cannot expect any specific modifier at a given position. To solve 
that, define modifier to be a union of all allowable specific modifiers for a 
given clause.

Additionally, implement modifier descriptors: for each modifier the 
corresponding descriptor contains a set of properties of the modifier that 
allow a common set of semantic checks. Start with the syntactic properties 
defined in the spec: Required, Unique, Exclusive, Ultimate, and implement 
common checks to verify each of them.

OpenMP modifier overhaul: #2/3

>From e8bbc26e136993758c3a3197eed6b1924c6531d0 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek 
Date: Mon, 18 Nov 2024 08:47:24 -0600
Subject: [PATCH] [flang][OpenMP] Change clause modifier representation in
 parser

The main issue to solve is that OpenMP modifiers can be specified
in any order, so the parser cannot expect any specific modifier at
a given position. To solve that, define modifier to be a union of
all allowable specific modifiers for a given clause.

Additionally, implement modifier descriptors: for each modifier the
corresponding descriptor contains a set of properties of the modifier
that allow a common set of semantic checks. Start with the syntactic
properties defined in the spec: Required, Unique, Exclusive, Ultimate,
and implement common checks to verify each of them.

OpenMP modifier overhaul: #2/3
---
 .../flang/Semantics/openmp-modifiers.h| 391 ++
 flang/lib/Semantics/CMakeLists.txt|   1 +
 flang/lib/Semantics/openmp-modifiers.cpp  | 146 +++
 llvm/include/llvm/Frontend/OpenMP/OMP.h   |   2 +
 llvm/lib/Frontend/OpenMP/OMP.cpp  |   5 +
 5 files changed, 545 insertions(+)
 create mode 100644 flang/include/flang/Semantics/openmp-modifiers.h
 create mode 100644 flang/lib/Semantics/openmp-modifiers.cpp

diff --git a/flang/include/flang/Semantics/openmp-modifiers.h 
b/flang/include/flang/Semantics/openmp-modifiers.h
new file mode 100644
index 00..6be582761ed687
--- /dev/null
+++ b/flang/include/flang/Semantics/openmp-modifiers.h
@@ -0,0 +1,391 @@
+//===-- flang/lib/Semantics/openmp-modifiers.h --*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef FORTRAN_SEMANTICS_OPENMP_MODIFIERS_H_
+#define FORTRAN_SEMANTICS_OPENMP_MODIFIERS_H_
+
+#include "flang/Common/enum-set.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/semantics.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+
+#include 
+#include 
+#include 
+#include 
+
+namespace Fortran::semantics {
+
+// Ref: [5.2:58]
+//
+// Syntactic properties for Clauses, Arguments and Modifiers
+//
+// Inverse properties:
+//   not Required  -> Optional
+//   not Unique-> Repeatable
+//   not Exclusive -> Compatible
+//   not Ultimate  -> Free
+//
+// Clause defaults:   Optional, Repeatable, Compatible, Free
+// Argument defaults: Required, Unique, Compatible, Free
+// Modifier defaults: Optional, Unique, Compatible, Free
+//
+// ---
+// Each modifier is used as either pre-modifier (i.e. modifier: item),
+// or post-modifier (i.e. item: modifier). The default is pre-.
+// Add an additional property that reflects the type of modifier.
+
+ENUM_CLASS(OmpProperty, Required, Unique, Exclusive, Ultimate, Post);
+using OmpProperties = common::EnumSet<OmpProperty, OmpProperty_enumSize>;
+using OmpClauses =
+common::EnumSet<llvm::omp::Clause, llvm::omp::Clause_enumSize>;
+
+struct OmpModifierDescriptor {
+  // Modifier name for use in diagnostic messages.
+  const OmpProperties &props(unsigned version) const;
+  const OmpClauses &clauses(unsigned version) const;
+
+  const llvm::StringRef name;
+  // Version-dependent properties of the modifier.
+  const std::map props_;
+  // Version-dependent set of clauses to which the modifier can apply.
+  const std::map clauses_;
+};
+
+template  const OmpModifierDescriptor &OmpGetDescriptor();
+
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor 
&OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+
+// Explanation of terminology:
+//
+// A typical clause with modifier[s] looks like this (with parts that are
+// not relevant here removed):
+//   struct OmpSomeClause {
+// struct Modifier {
+//   using Variant = std::variant;
+//   Variant u;
+// };
+// std::tuple>, ...> t;
+//   };
+//
+// The Specific1, etc. refer to parser cla

[llvm-branch-commits] [flang] [flang][OpenMP] Apply modifier representation to semantic checks (PR #116658)

2024-11-18 Thread Krzysztof Parzyszek via llvm-branch-commits

https://github.com/kparzysz created 
https://github.com/llvm/llvm-project/pull/116658

Also, define helper macros in parse-tree.h.

Apply the new modifier representation to the DEFAULTMAP and REDUCTION clauses, 
with testcases utilizing the new modifier validation.

OpenMP modifier overhaul: #3/3

>From fac6a8594643811418f37ee42fc1ac35bcc2a244 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek 
Date: Thu, 14 Nov 2024 07:29:59 -0600
Subject: [PATCH] [flang][OpenMP] Apply modifier representation to semantic
 checks

Also, define helper macros in parse-tree.h.

Apply the new modifier representation to the DEFAULTMAP and REDUCTION
clauses, with testcases utilizing the new modifier validation.

OpenMP modifier overhaul: #3/3
---
 flang/include/flang/Parser/dump-parse-tree.h  |  8 +-
 flang/include/flang/Parser/parse-tree.h   | 49 +--
 .../flang/Semantics/openmp-modifiers.h|  4 +
 flang/lib/Lower/OpenMP/Clauses.cpp| 33 
 flang/lib/Parser/openmp-parsers.cpp   | 40 +
 flang/lib/Parser/unparse.cpp  | 15 ++--
 flang/lib/Semantics/check-omp-structure.cpp   | 83 +++
 flang/lib/Semantics/check-omp-structure.h |  3 +-
 flang/lib/Semantics/openmp-modifiers.cpp  | 33 
 flang/lib/Semantics/resolve-directives.cpp| 52 +++-
 .../test/Parser/OpenMP/defaultmap-clause.f90  |  8 +-
 .../test/Parser/OpenMP/defaultmap-unparse.f90 | 16 ++--
 .../test/Parser/OpenMP/reduction-modifier.f90 |  6 +-
 .../Semantics/OpenMP/combined-constructs.f90  | 12 +--
 .../OpenMP/defaultmap-clause-v45.f90  |  2 +-
 15 files changed, 236 insertions(+), 128 deletions(-)

diff --git a/flang/include/flang/Parser/dump-parse-tree.h 
b/flang/include/flang/Parser/dump-parse-tree.h
index df5bf1d8d3200e..9c59ce520a31aa 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -509,9 +509,11 @@ class ParseTreeDumper {
   NODE(parser, OmpDeclareMapperSpecifier)
   NODE(parser, OmpDefaultClause)
   NODE_ENUM(OmpDefaultClause, Type)
+  NODE(parser, OmpVariableCategory)
+  NODE_ENUM(OmpVariableCategory, Value)
   NODE(parser, OmpDefaultmapClause)
   NODE_ENUM(OmpDefaultmapClause, ImplicitBehavior)
-  NODE_ENUM(OmpDefaultmapClause, VariableCategory)
+  NODE(OmpDefaultmapClause, Modifier)
   NODE(parser, OmpDependenceType)
   NODE_ENUM(OmpDependenceType, Value)
   NODE(parser, OmpTaskDependenceType)
@@ -567,8 +569,10 @@ class ParseTreeDumper {
   NODE_ENUM(OmpBindClause, Type)
   NODE(parser, OmpProcBindClause)
   NODE_ENUM(OmpProcBindClause, Type)
-  NODE_ENUM(OmpReductionClause, ReductionModifier)
+  NODE(parser, OmpReductionModifier)
+  NODE_ENUM(OmpReductionModifier, Value)
   NODE(parser, OmpReductionClause)
+  NODE(OmpReductionClause, Modifier)
   NODE(parser, OmpInReductionClause)
   NODE(parser, OmpReductionCombiner)
   NODE(OmpReductionCombiner, FunctionCombiner)
diff --git a/flang/include/flang/Parser/parse-tree.h 
b/flang/include/flang/Parser/parse-tree.h
index ef49a36578270e..5b28bcd4e21b80 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3440,6 +3440,16 @@ struct OmpObject {
 
 WRAPPER_CLASS(OmpObjectList, std::list);
 
+#define MODIFIER_BOILERPLATE(...) \
+  struct Modifier { \
+using Variant = std::variant<__VA_ARGS__>; \
+UNION_CLASS_BOILERPLATE(Modifier); \
+CharBlock source; \
+Variant u; \
+  }
+
+#define MODIFIERS() std::optional>
+
 inline namespace modifier {
 // For uniformity, in all keyword modifiers the name of the type defined
 // by ENUM_CLASS is "Value", e.g.
@@ -3505,12 +3515,20 @@ struct OmpLinearModifier {
 //   - |// since 4.5, until 5.2
 //   + | * | .AND. | .OR. | .EQV. | .NEQV. |// since 4.5
 //   MIN | MAX | IAND | IOR | IEOR  // since 4.5
-//
 struct OmpReductionIdentifier {
   UNION_CLASS_BOILERPLATE(OmpReductionIdentifier);
   std::variant u;
 };
 
+// Ref: [5.0:300-302], [5.1:332-334], [5.2:134-137]
+//
+// reduction-modifier ->
+//   DEFAULT | INSCAN | TASK// since 5.0
+struct OmpReductionModifier {
+  ENUM_CLASS(Value, Default, Inscan, Task);
+  WRAPPER_CLASS_BOILERPLATE(OmpReductionModifier, Value);
+};
+
 // Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321]
 //
 // task-dependence-type -> // "dependence-type" in 5.1 and before
@@ -3521,6 +3539,17 @@ struct OmpTaskDependenceType {
   ENUM_CLASS(Value, In, Out, Inout, Inoutset, Mutexinoutset, Depobj)
   WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Value);
 };
+
+// Ref: [4.5:229-230], [5.0:324-325], [5.1:357-358], [5.2:161-162]
+//
+// variable-category ->
+//   SCALAR |   // since 4.5
+//   AGGREGATE | ALLOCATABLE | POINTER |// since 5.0
+//   ALL// since 5.2
+struct OmpVariableCategory {
+  ENUM_CLASS(Value, Aggregate, All, Allocatable

[llvm-branch-commits] [flang] [llvm] [flang][OpenMP] Change clause modifier representation in parser (PR #116656)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-flang-semantics

Author: Krzysztof Parzyszek (kparzysz)


Changes

The main issue to solve is that OpenMP modifiers can be specified in any order, 
so the parser cannot expect any specific modifier at a given position. To solve 
that, define modifier to be a union of all allowable specific modifiers for a 
given clause.

Additionally, implement modifier descriptors: for each modifier the 
corresponding descriptor contains a set of properties of the modifier that 
allow a common set of semantic checks. Start with the syntactic properties 
defined in the spec: Required, Unique, Exclusive, Ultimate, and implement 
common checks to verify each of them.

OpenMP modifier overhaul: #2/3

---

Patch is 21.10 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/116656.diff


5 Files Affected:

- (added) flang/include/flang/Semantics/openmp-modifiers.h (+391) 
- (modified) flang/lib/Semantics/CMakeLists.txt (+1) 
- (added) flang/lib/Semantics/openmp-modifiers.cpp (+146) 
- (modified) llvm/include/llvm/Frontend/OpenMP/OMP.h (+2) 
- (modified) llvm/lib/Frontend/OpenMP/OMP.cpp (+5) 


``diff
diff --git a/flang/include/flang/Semantics/openmp-modifiers.h 
b/flang/include/flang/Semantics/openmp-modifiers.h
new file mode 100644
index 00..6be582761ed687
--- /dev/null
+++ b/flang/include/flang/Semantics/openmp-modifiers.h
@@ -0,0 +1,391 @@
+//===-- flang/lib/Semantics/openmp-modifiers.h --*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef FORTRAN_SEMANTICS_OPENMP_MODIFIERS_H_
+#define FORTRAN_SEMANTICS_OPENMP_MODIFIERS_H_
+
+#include "flang/Common/enum-set.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/semantics.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+
+#include 
+#include 
+#include 
+#include 
+
+namespace Fortran::semantics {
+
+// Ref: [5.2:58]
+//
+// Syntactic properties for Clauses, Arguments and Modifiers
+//
+// Inverse properties:
+//   not Required  -> Optional
+//   not Unique-> Repeatable
+//   not Exclusive -> Compatible
+//   not Ultimate  -> Free
+//
+// Clause defaults:   Optional, Repeatable, Compatible, Free
+// Argument defaults: Required, Unique, Compatible, Free
+// Modifier defaults: Optional, Unique, Compatible, Free
+//
+// ---
+// Each modifier is used as either pre-modifier (i.e. modifier: item),
+// or post-modifier (i.e. item: modifier). The default is pre-.
+// Add an additional property that reflects the type of modifier.
+
+ENUM_CLASS(OmpProperty, Required, Unique, Exclusive, Ultimate, Post);
+using OmpProperties = common::EnumSet<OmpProperty, OmpProperty_enumSize>;
+using OmpClauses =
+    common::EnumSet<llvm::omp::Clause, llvm::omp::Clause_enumSize>;
+
+struct OmpModifierDescriptor {
+  // Modifier name for use in diagnostic messages.
+  const OmpProperties &props(unsigned version) const;
+  const OmpClauses &clauses(unsigned version) const;
+
+  const llvm::StringRef name;
+  // Version-dependent properties of the modifier.
+  const std::map<unsigned, OmpProperties> props_;
+  // Version-dependent set of clauses to which the modifier can apply.
+  const std::map<unsigned, OmpClauses> clauses_;
+};
+
+template <typename SpecificTy> const OmpModifierDescriptor &OmpGetDescriptor();
+
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor 
&OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+
+// Explanation of terminology:
+//
+// A typical clause with modifier[s] looks like this (with parts that are
+// not relevant here removed):
+//   struct OmpSomeClause {
+//     struct Modifier {
+//       using Variant = std::variant<Specific1, Specific2...>;
+//       Variant u;
+//     };
+//     std::tuple<std::optional<std::list<Modifier>>, ...> t;
+//   };
+//
+// The Specific1, etc. refer to parser classes that represent modifiers,
+// e.g. OmpIterator or OmpTaskDependenceType. The Variant type contains
+// all modifiers that are allowed for a given clause. The Modifier class
+// is there to wrap the variant into the form that the parse tree visitor
+// expects, i.e. with traits, member "u", etc.
+//
+// To avoid ambiguities with the word "modifier" (e.g. is it "any modifier",
+// or "this specific modifier"?), the following code uses different terms:
+//
+// - UnionTy:refers to the nested "Modifier" class, i.e.
+//   "OmpSomeClause::Modifier" in the example above.
+// - SpecificTy: refers to any of the alternatives, i.e. "Specific1" or
+//   "Specific2".
+
+template <typename UnionTy>
+const OmpModifierDescriptor &OmpGetDescriptor(const UnionTy &modifier) {
+  return common::visit(
+  [](auto &&m) -> d

[llvm-branch-commits] [flang] [flang][OpenMP] Apply modifier representation to semantic checks (PR #116658)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-flang-openmp

Author: Krzysztof Parzyszek (kparzysz)


Changes

Also, define helper macros in parse-tree.h.

Apply the new modifier representation to the DEFAULTMAP and REDUCTION clauses, 
with testcases utilizing the new modifier validation.

OpenMP modifier overhaul: #3/3

---

Patch is 37.74 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/116658.diff


15 Files Affected:

- (modified) flang/include/flang/Parser/dump-parse-tree.h (+6-2) 
- (modified) flang/include/flang/Parser/parse-tree.h (+40-9) 
- (modified) flang/include/flang/Semantics/openmp-modifiers.h (+4) 
- (modified) flang/lib/Lower/OpenMP/Clauses.cpp (+19-14) 
- (modified) flang/lib/Parser/openmp-parsers.cpp (+24-16) 
- (modified) flang/lib/Parser/unparse.cpp (+6-9) 
- (modified) flang/lib/Semantics/check-omp-structure.cpp (+49-34) 
- (modified) flang/lib/Semantics/check-omp-structure.h (+1-2) 
- (modified) flang/lib/Semantics/openmp-modifiers.cpp (+33) 
- (modified) flang/lib/Semantics/resolve-directives.cpp (+32-20) 
- (modified) flang/test/Parser/OpenMP/defaultmap-clause.f90 (+4-4) 
- (modified) flang/test/Parser/OpenMP/defaultmap-unparse.f90 (+8-8) 
- (modified) flang/test/Parser/OpenMP/reduction-modifier.f90 (+3-3) 
- (modified) flang/test/Semantics/OpenMP/combined-constructs.f90 (+6-6) 
- (modified) flang/test/Semantics/OpenMP/defaultmap-clause-v45.f90 (+1-1) 


``diff
diff --git a/flang/include/flang/Parser/dump-parse-tree.h 
b/flang/include/flang/Parser/dump-parse-tree.h
index df5bf1d8d3200e..9c59ce520a31aa 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -509,9 +509,11 @@ class ParseTreeDumper {
   NODE(parser, OmpDeclareMapperSpecifier)
   NODE(parser, OmpDefaultClause)
   NODE_ENUM(OmpDefaultClause, Type)
+  NODE(parser, OmpVariableCategory)
+  NODE_ENUM(OmpVariableCategory, Value)
   NODE(parser, OmpDefaultmapClause)
   NODE_ENUM(OmpDefaultmapClause, ImplicitBehavior)
-  NODE_ENUM(OmpDefaultmapClause, VariableCategory)
+  NODE(OmpDefaultmapClause, Modifier)
   NODE(parser, OmpDependenceType)
   NODE_ENUM(OmpDependenceType, Value)
   NODE(parser, OmpTaskDependenceType)
@@ -567,8 +569,10 @@ class ParseTreeDumper {
   NODE_ENUM(OmpBindClause, Type)
   NODE(parser, OmpProcBindClause)
   NODE_ENUM(OmpProcBindClause, Type)
-  NODE_ENUM(OmpReductionClause, ReductionModifier)
+  NODE(parser, OmpReductionModifier)
+  NODE_ENUM(OmpReductionModifier, Value)
   NODE(parser, OmpReductionClause)
+  NODE(OmpReductionClause, Modifier)
   NODE(parser, OmpInReductionClause)
   NODE(parser, OmpReductionCombiner)
   NODE(OmpReductionCombiner, FunctionCombiner)
diff --git a/flang/include/flang/Parser/parse-tree.h 
b/flang/include/flang/Parser/parse-tree.h
index ef49a36578270e..5b28bcd4e21b80 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3440,6 +3440,16 @@ struct OmpObject {
 
 WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>);
 
+#define MODIFIER_BOILERPLATE(...) \
+  struct Modifier { \
+using Variant = std::variant<__VA_ARGS__>; \
+UNION_CLASS_BOILERPLATE(Modifier); \
+CharBlock source; \
+Variant u; \
+  }
+
+#define MODIFIERS() std::optional<std::list<Modifier>>
+
 inline namespace modifier {
 // For uniformity, in all keyword modifiers the name of the type defined
 // by ENUM_CLASS is "Value", e.g.
@@ -3505,12 +3515,20 @@ struct OmpLinearModifier {
 //   - |// since 4.5, until 5.2
 //   + | * | .AND. | .OR. | .EQV. | .NEQV. |// since 4.5
 //   MIN | MAX | IAND | IOR | IEOR  // since 4.5
-//
 struct OmpReductionIdentifier {
   UNION_CLASS_BOILERPLATE(OmpReductionIdentifier);
   std::variant<DefinedOperator, ProcedureDesignator> u;
 };
 
+// Ref: [5.0:300-302], [5.1:332-334], [5.2:134-137]
+//
+// reduction-modifier ->
+//   DEFAULT | INSCAN | TASK// since 5.0
+struct OmpReductionModifier {
+  ENUM_CLASS(Value, Default, Inscan, Task);
+  WRAPPER_CLASS_BOILERPLATE(OmpReductionModifier, Value);
+};
+
 // Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321]
 //
 // task-dependence-type -> // "dependence-type" in 5.1 and before
@@ -3521,6 +3539,17 @@ struct OmpTaskDependenceType {
   ENUM_CLASS(Value, In, Out, Inout, Inoutset, Mutexinoutset, Depobj)
   WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Value);
 };
+
+// Ref: [4.5:229-230], [5.0:324-325], [5.1:357-358], [5.2:161-162]
+//
+// variable-category ->
+//   SCALAR |   // since 4.5
+//   AGGREGATE | ALLOCATABLE | POINTER |// since 5.0
+//   ALL// since 5.2
+struct OmpVariableCategory {
+  ENUM_CLASS(Value, Aggregate, All, Allocatable, Pointer, Scalar)
+  WRAPPER_CLASS_BOILERPLATE(OmpVariableCategory, Value);
+};
 } // namespace modifier
 
 // --- Clauses
@@ -3578,8 +3607,8 @@ struct OmpDefaultmapClause {
   TUPLE_CLASS_BOILERPLATE(OmpDefaultmapClause

[llvm-branch-commits] [flang] [flang][OpenMP] Apply modifier representation to semantic checks (PR #116658)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-flang-parser

Author: Krzysztof Parzyszek (kparzysz)


Changes

Also, define helper macros in parse-tree.h.

Apply the new modifier representation to the DEFAULTMAP and REDUCTION clauses, 
with testcases utilizing the new modifier validation.

OpenMP modifier overhaul: #3/3

---

Patch is 37.74 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/116658.diff


15 Files Affected:

- (modified) flang/include/flang/Parser/dump-parse-tree.h (+6-2) 
- (modified) flang/include/flang/Parser/parse-tree.h (+40-9) 
- (modified) flang/include/flang/Semantics/openmp-modifiers.h (+4) 
- (modified) flang/lib/Lower/OpenMP/Clauses.cpp (+19-14) 
- (modified) flang/lib/Parser/openmp-parsers.cpp (+24-16) 
- (modified) flang/lib/Parser/unparse.cpp (+6-9) 
- (modified) flang/lib/Semantics/check-omp-structure.cpp (+49-34) 
- (modified) flang/lib/Semantics/check-omp-structure.h (+1-2) 
- (modified) flang/lib/Semantics/openmp-modifiers.cpp (+33) 
- (modified) flang/lib/Semantics/resolve-directives.cpp (+32-20) 
- (modified) flang/test/Parser/OpenMP/defaultmap-clause.f90 (+4-4) 
- (modified) flang/test/Parser/OpenMP/defaultmap-unparse.f90 (+8-8) 
- (modified) flang/test/Parser/OpenMP/reduction-modifier.f90 (+3-3) 
- (modified) flang/test/Semantics/OpenMP/combined-constructs.f90 (+6-6) 
- (modified) flang/test/Semantics/OpenMP/defaultmap-clause-v45.f90 (+1-1) 


``diff
diff --git a/flang/include/flang/Parser/dump-parse-tree.h 
b/flang/include/flang/Parser/dump-parse-tree.h
index df5bf1d8d3200e..9c59ce520a31aa 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -509,9 +509,11 @@ class ParseTreeDumper {
   NODE(parser, OmpDeclareMapperSpecifier)
   NODE(parser, OmpDefaultClause)
   NODE_ENUM(OmpDefaultClause, Type)
+  NODE(parser, OmpVariableCategory)
+  NODE_ENUM(OmpVariableCategory, Value)
   NODE(parser, OmpDefaultmapClause)
   NODE_ENUM(OmpDefaultmapClause, ImplicitBehavior)
-  NODE_ENUM(OmpDefaultmapClause, VariableCategory)
+  NODE(OmpDefaultmapClause, Modifier)
   NODE(parser, OmpDependenceType)
   NODE_ENUM(OmpDependenceType, Value)
   NODE(parser, OmpTaskDependenceType)
@@ -567,8 +569,10 @@ class ParseTreeDumper {
   NODE_ENUM(OmpBindClause, Type)
   NODE(parser, OmpProcBindClause)
   NODE_ENUM(OmpProcBindClause, Type)
-  NODE_ENUM(OmpReductionClause, ReductionModifier)
+  NODE(parser, OmpReductionModifier)
+  NODE_ENUM(OmpReductionModifier, Value)
   NODE(parser, OmpReductionClause)
+  NODE(OmpReductionClause, Modifier)
   NODE(parser, OmpInReductionClause)
   NODE(parser, OmpReductionCombiner)
   NODE(OmpReductionCombiner, FunctionCombiner)
diff --git a/flang/include/flang/Parser/parse-tree.h 
b/flang/include/flang/Parser/parse-tree.h
index ef49a36578270e..5b28bcd4e21b80 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3440,6 +3440,16 @@ struct OmpObject {
 
 WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>);
 
+#define MODIFIER_BOILERPLATE(...) \
+  struct Modifier { \
+using Variant = std::variant<__VA_ARGS__>; \
+UNION_CLASS_BOILERPLATE(Modifier); \
+CharBlock source; \
+Variant u; \
+  }
+
+#define MODIFIERS() std::optional<std::list<Modifier>>
+
 inline namespace modifier {
 // For uniformity, in all keyword modifiers the name of the type defined
 // by ENUM_CLASS is "Value", e.g.
@@ -3505,12 +3515,20 @@ struct OmpLinearModifier {
 //   - |// since 4.5, until 5.2
 //   + | * | .AND. | .OR. | .EQV. | .NEQV. |// since 4.5
 //   MIN | MAX | IAND | IOR | IEOR  // since 4.5
-//
 struct OmpReductionIdentifier {
   UNION_CLASS_BOILERPLATE(OmpReductionIdentifier);
   std::variant<DefinedOperator, ProcedureDesignator> u;
 };
 
+// Ref: [5.0:300-302], [5.1:332-334], [5.2:134-137]
+//
+// reduction-modifier ->
+//   DEFAULT | INSCAN | TASK// since 5.0
+struct OmpReductionModifier {
+  ENUM_CLASS(Value, Default, Inscan, Task);
+  WRAPPER_CLASS_BOILERPLATE(OmpReductionModifier, Value);
+};
+
 // Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321]
 //
 // task-dependence-type -> // "dependence-type" in 5.1 and before
@@ -3521,6 +3539,17 @@ struct OmpTaskDependenceType {
   ENUM_CLASS(Value, In, Out, Inout, Inoutset, Mutexinoutset, Depobj)
   WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Value);
 };
+
+// Ref: [4.5:229-230], [5.0:324-325], [5.1:357-358], [5.2:161-162]
+//
+// variable-category ->
+//   SCALAR |   // since 4.5
+//   AGGREGATE | ALLOCATABLE | POINTER |// since 5.0
+//   ALL// since 5.2
+struct OmpVariableCategory {
+  ENUM_CLASS(Value, Aggregate, All, Allocatable, Pointer, Scalar)
+  WRAPPER_CLASS_BOILERPLATE(OmpVariableCategory, Value);
+};
 } // namespace modifier
 
 // --- Clauses
@@ -3578,8 +3607,8 @@ struct OmpDefaultmapClause {
   TUPLE_CLASS_BOILERPLATE(OmpDefaultmapClause

[llvm-branch-commits] [llvm] [Linker] Remove a use of StructType::setBody. NFC. (PR #116653)

2024-11-18 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-lto

Author: Jay Foad (jayfoad)


Changes

This falls out naturally after inlining finishType into its only remaining use.

---
Full diff: https://github.com/llvm/llvm-project/pull/116653.diff


1 Files Affected:

- (modified) llvm/lib/Linker/IRMover.cpp (+11-18) 


``diff
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index c653900c632cc9..4bb0ddf891744b 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -82,8 +82,6 @@ class TypeMapTy : public ValueMapTypeRemapper {
   Type *get(Type *SrcTy);
   Type *get(Type *SrcTy, SmallPtrSet<StructType *, 8> &Visited);
 
-  void finishType(StructType *DTy, StructType *STy, ArrayRef<Type *> ETypes);
-
   FunctionType *get(FunctionType *T) {
     return cast<FunctionType>(get((Type *)T));
   }
@@ -233,20 +231,6 @@ Error TypeMapTy::linkDefinedTypeBodies() {
   return Error::success();
 }
 
-void TypeMapTy::finishType(StructType *DTy, StructType *STy,
-   ArrayRef ETypes) {
-  DTy->setBody(ETypes, STy->isPacked());
-
-  // Steal STy's name.
-  if (STy->hasName()) {
-SmallString<16> TmpName = STy->getName();
-STy->setName("");
-DTy->setName(TmpName);
-  }
-
-  DstStructTypesSet.addNonOpaque(DTy);
-}
-
 Type *TypeMapTy::get(Type *Ty) {
   SmallPtrSet Visited;
   return get(Ty, Visited);
@@ -342,8 +326,17 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet &Visited) {
   return *Entry = Ty;
 }
 
-StructType *DTy = StructType::create(Ty->getContext());
-finishType(DTy, STy, ElementTypes);
+StructType *DTy =
+StructType::create(Ty->getContext(), ElementTypes, "", 
STy->isPacked());
+
+// Steal STy's name.
+if (STy->hasName()) {
+  SmallString<16> TmpName = STy->getName();
+  STy->setName("");
+  DTy->setName(TmpName);
+}
+
+DstStructTypesSet.addNonOpaque(DTy);
 return *Entry = DTy;
   }
   }

``




https://github.com/llvm/llvm-project/pull/116653
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [flang][OpenMP] Apply modifier representation to semantic checks (PR #116658)

2024-11-18 Thread via llvm-branch-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


``bash
git-clang-format --diff e8bbc26e136993758c3a3197eed6b1924c6531d0 
fac6a8594643811418f37ee42fc1ac35bcc2a244 --extensions h,cpp -- 
flang/include/flang/Parser/dump-parse-tree.h 
flang/include/flang/Parser/parse-tree.h 
flang/include/flang/Semantics/openmp-modifiers.h 
flang/lib/Lower/OpenMP/Clauses.cpp flang/lib/Parser/openmp-parsers.cpp 
flang/lib/Parser/unparse.cpp flang/lib/Semantics/check-omp-structure.cpp 
flang/lib/Semantics/check-omp-structure.h 
flang/lib/Semantics/openmp-modifiers.cpp 
flang/lib/Semantics/resolve-directives.cpp
``





View the diff from clang-format here.


``diff
diff --git a/flang/lib/Parser/openmp-parsers.cpp 
b/flang/lib/Parser/openmp-parsers.cpp
index 063201fc86..3ee8159682 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -255,8 +255,8 @@ TYPE_PARSER(construct(
 "POINTER" >> pure(OmpVariableCategory::Value::Pointer) ||
 "SCALAR" >> pure(OmpVariableCategory::Value::Scalar)))
 
-TYPE_PARSER(sourced(construct(
-Parser{})))
+TYPE_PARSER(sourced(
+construct(Parser{})))
 
 // --- Parsers for clauses 
 

``




https://github.com/llvm/llvm-project/pull/116658
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC] Use unique_ptr in SparseSet (PR #116617)

2024-11-18 Thread David Blaikie via llvm-branch-commits


@@ -205,7 +209,7 @@ class SparseSet {
 assert(Idx < Universe && "Key out of range");
 assert(Sparse != nullptr && "Invalid sparse type");
 const unsigned Stride = std::numeric_limits<SparseT>::max() + 1u;
-for (unsigned i = Sparse[Idx], e = size(); i < e; i += Stride) {
+for (unsigned i = Sparse.get()[Idx], e = size(); i < e; i += Stride) {

dwblaikie wrote:

If you make the `std::unique_ptr<SparseT>` into a 
`std::unique_ptr<SparseT[]>` then you can use [] directly without the 
`.get()` I think?

https://github.com/llvm/llvm-project/pull/116617
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC] Use unique_ptr in SparseSet (PR #116617)

2024-11-18 Thread David Blaikie via llvm-branch-commits

dwblaikie wrote:

Oh, and please add unit test coverage for the new move functionality.

https://github.com/llvm/llvm-project/pull/116617
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [flang][OpenMP] Apply modifier representation to semantic checks (PR #116658)

2024-11-18 Thread Krzysztof Parzyszek via llvm-branch-commits

https://github.com/kparzysz updated 
https://github.com/llvm/llvm-project/pull/116658

>From fac6a8594643811418f37ee42fc1ac35bcc2a244 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek 
Date: Thu, 14 Nov 2024 07:29:59 -0600
Subject: [PATCH 1/2] [flang][OpenMP] Apply modifier representation to semantic
 checks

Also, define helper macros in parse-tree.h.

Apply the new modifier representation to the DEFAULTMAP and REDUCTION
clauses, with testcases utilizing the new modifier validation.

OpenMP modifier overhaul: #3/3
---
 flang/include/flang/Parser/dump-parse-tree.h  |  8 +-
 flang/include/flang/Parser/parse-tree.h   | 49 +--
 .../flang/Semantics/openmp-modifiers.h|  4 +
 flang/lib/Lower/OpenMP/Clauses.cpp| 33 
 flang/lib/Parser/openmp-parsers.cpp   | 40 +
 flang/lib/Parser/unparse.cpp  | 15 ++--
 flang/lib/Semantics/check-omp-structure.cpp   | 83 +++
 flang/lib/Semantics/check-omp-structure.h |  3 +-
 flang/lib/Semantics/openmp-modifiers.cpp  | 33 
 flang/lib/Semantics/resolve-directives.cpp| 52 +++-
 .../test/Parser/OpenMP/defaultmap-clause.f90  |  8 +-
 .../test/Parser/OpenMP/defaultmap-unparse.f90 | 16 ++--
 .../test/Parser/OpenMP/reduction-modifier.f90 |  6 +-
 .../Semantics/OpenMP/combined-constructs.f90  | 12 +--
 .../OpenMP/defaultmap-clause-v45.f90  |  2 +-
 15 files changed, 236 insertions(+), 128 deletions(-)

diff --git a/flang/include/flang/Parser/dump-parse-tree.h 
b/flang/include/flang/Parser/dump-parse-tree.h
index df5bf1d8d3200e..9c59ce520a31aa 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -509,9 +509,11 @@ class ParseTreeDumper {
   NODE(parser, OmpDeclareMapperSpecifier)
   NODE(parser, OmpDefaultClause)
   NODE_ENUM(OmpDefaultClause, Type)
+  NODE(parser, OmpVariableCategory)
+  NODE_ENUM(OmpVariableCategory, Value)
   NODE(parser, OmpDefaultmapClause)
   NODE_ENUM(OmpDefaultmapClause, ImplicitBehavior)
-  NODE_ENUM(OmpDefaultmapClause, VariableCategory)
+  NODE(OmpDefaultmapClause, Modifier)
   NODE(parser, OmpDependenceType)
   NODE_ENUM(OmpDependenceType, Value)
   NODE(parser, OmpTaskDependenceType)
@@ -567,8 +569,10 @@ class ParseTreeDumper {
   NODE_ENUM(OmpBindClause, Type)
   NODE(parser, OmpProcBindClause)
   NODE_ENUM(OmpProcBindClause, Type)
-  NODE_ENUM(OmpReductionClause, ReductionModifier)
+  NODE(parser, OmpReductionModifier)
+  NODE_ENUM(OmpReductionModifier, Value)
   NODE(parser, OmpReductionClause)
+  NODE(OmpReductionClause, Modifier)
   NODE(parser, OmpInReductionClause)
   NODE(parser, OmpReductionCombiner)
   NODE(OmpReductionCombiner, FunctionCombiner)
diff --git a/flang/include/flang/Parser/parse-tree.h 
b/flang/include/flang/Parser/parse-tree.h
index ef49a36578270e..5b28bcd4e21b80 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3440,6 +3440,16 @@ struct OmpObject {
 
 WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>);
 
+#define MODIFIER_BOILERPLATE(...) \
+  struct Modifier { \
+using Variant = std::variant<__VA_ARGS__>; \
+UNION_CLASS_BOILERPLATE(Modifier); \
+CharBlock source; \
+Variant u; \
+  }
+
+#define MODIFIERS() std::optional<std::list<Modifier>>
+
 inline namespace modifier {
 // For uniformity, in all keyword modifiers the name of the type defined
 // by ENUM_CLASS is "Value", e.g.
@@ -3505,12 +3515,20 @@ struct OmpLinearModifier {
 //   - |// since 4.5, until 5.2
 //   + | * | .AND. | .OR. | .EQV. | .NEQV. |// since 4.5
 //   MIN | MAX | IAND | IOR | IEOR  // since 4.5
-//
 struct OmpReductionIdentifier {
   UNION_CLASS_BOILERPLATE(OmpReductionIdentifier);
   std::variant<DefinedOperator, ProcedureDesignator> u;
 };
 
+// Ref: [5.0:300-302], [5.1:332-334], [5.2:134-137]
+//
+// reduction-modifier ->
+//   DEFAULT | INSCAN | TASK// since 5.0
+struct OmpReductionModifier {
+  ENUM_CLASS(Value, Default, Inscan, Task);
+  WRAPPER_CLASS_BOILERPLATE(OmpReductionModifier, Value);
+};
+
 // Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321]
 //
 // task-dependence-type -> // "dependence-type" in 5.1 and before
@@ -3521,6 +3539,17 @@ struct OmpTaskDependenceType {
   ENUM_CLASS(Value, In, Out, Inout, Inoutset, Mutexinoutset, Depobj)
   WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Value);
 };
+
+// Ref: [4.5:229-230], [5.0:324-325], [5.1:357-358], [5.2:161-162]
+//
+// variable-category ->
+//   SCALAR |   // since 4.5
+//   AGGREGATE | ALLOCATABLE | POINTER |// since 5.0
+//   ALL// since 5.2
+struct OmpVariableCategory {
+  ENUM_CLASS(Value, Aggregate, All, Allocatable, Pointer, Scalar)
+  WRAPPER_CLASS_BOILERPLATE(OmpVariableCategory, Value);
+};
 } // namespace modifier
 
 // --- Clauses
@@ -3578,8 +3607,8 @@ struct OmpDefaultmapClause {
   TUPLE_CLASS_BOILERPLATE(OmpDef

[llvm-branch-commits] [llvm] [Linker] Remove a use of StructType::setBody. NFC. (PR #116653)

2024-11-18 Thread Jay Foad via llvm-branch-commits

https://github.com/jayfoad created 
https://github.com/llvm/llvm-project/pull/116653

This falls out naturally after inlining finishType into its only remaining use.

>From 4140bc772f5930807cb2ea5b4b2aa945c57b699c Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Mon, 18 Nov 2024 16:36:33 +
Subject: [PATCH] [Linker] Remove a use of StructType::setBody. NFC.

This falls out naturally after inlining finishType into its only
remaining use.
---
 llvm/lib/Linker/IRMover.cpp | 29 +++--
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index c653900c632cc9..4bb0ddf891744b 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -82,8 +82,6 @@ class TypeMapTy : public ValueMapTypeRemapper {
   Type *get(Type *SrcTy);
   Type *get(Type *SrcTy, SmallPtrSet<StructType *, 8> &Visited);
 
-  void finishType(StructType *DTy, StructType *STy, ArrayRef<Type *> ETypes);
-
   FunctionType *get(FunctionType *T) {
     return cast<FunctionType>(get((Type *)T));
   }
@@ -233,20 +231,6 @@ Error TypeMapTy::linkDefinedTypeBodies() {
   return Error::success();
 }
 
-void TypeMapTy::finishType(StructType *DTy, StructType *STy,
-   ArrayRef ETypes) {
-  DTy->setBody(ETypes, STy->isPacked());
-
-  // Steal STy's name.
-  if (STy->hasName()) {
-SmallString<16> TmpName = STy->getName();
-STy->setName("");
-DTy->setName(TmpName);
-  }
-
-  DstStructTypesSet.addNonOpaque(DTy);
-}
-
 Type *TypeMapTy::get(Type *Ty) {
   SmallPtrSet Visited;
   return get(Ty, Visited);
@@ -342,8 +326,17 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet &Visited) {
   return *Entry = Ty;
 }
 
-StructType *DTy = StructType::create(Ty->getContext());
-finishType(DTy, STy, ElementTypes);
+StructType *DTy =
+StructType::create(Ty->getContext(), ElementTypes, "", 
STy->isPacked());
+
+// Steal STy's name.
+if (STy->hasName()) {
+  SmallString<16> TmpName = STy->getName();
+  STy->setName("");
+  DTy->setName(TmpName);
+}
+
+DstStructTypesSet.addNonOpaque(DTy);
 return *Entry = DTy;
   }
   }

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] release/19.x: Reapply "[sanitizer_common] AND signals in BlockSignals instead of deleting (#113443)" for non-Android Linux only (#115790) (PR #116670)

2024-11-18 Thread via llvm-branch-commits

https://github.com/llvmbot milestoned 
https://github.com/llvm/llvm-project/pull/116670
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] release/19.x: Reapply "[sanitizer_common] AND signals in BlockSignals instead of deleting (#113443)" for non-Android Linux only (#115790) (PR #116670)

2024-11-18 Thread via llvm-branch-commits

github-actions[bot] wrote:



Thank you for submitting a Pull Request (PR) to the LLVM Project!

This PR will be automatically labeled and the relevant teams will be notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this 
page.

If this is not working for you, it is probably because you do not have write 
permissions for the repository. In which case you can instead tag reviewers by 
name in a comment by using `@` followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a 
review by "ping"ing the PR by adding a comment “Ping”. The common courtesy 
"ping" rate is once a week. Please remember that you are asking for valuable 
time from other developers.

If you have further questions, they may be answered by the [LLVM GitHub User 
Guide](https://llvm.org/docs/GitHub.html).

You can also ask questions in a comment on this PR, on the [LLVM 
Discord](https://discord.com/invite/xS7Z362) or on the 
[forums](https://discourse.llvm.org/).

https://github.com/llvm/llvm-project/pull/116670
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] release/19.x: Reapply "[sanitizer_common] AND signals in BlockSignals instead of deleting (#113443)" for non-Android Linux only (#115790) (PR #116670)

2024-11-18 Thread Vitaly Buka via llvm-branch-commits

https://github.com/vitalybuka approved this pull request.


https://github.com/llvm/llvm-project/pull/116670
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [RISCV] Support __builtin_cpu_is (PR #116231)

2024-11-18 Thread Craig Topper via llvm-branch-commits


@@ -22505,6 +22506,47 @@ Value 
*CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
   return nullptr;
 }
 
+Value *CodeGenFunction::EmitRISCVCpuIs(const CallExpr *E) {
+  const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
+  StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
+  return EmitRISCVCpuIs(CPUStr);
+}
+
+Value *CodeGenFunction::EmitRISCVCpuIs(StringRef CPUStr) {
+  llvm::Type *Int32Ty = Builder.getInt32Ty();
+  llvm::Type *Int64Ty = Builder.getInt64Ty();
+  llvm::StructType *StructTy = llvm::StructType::get(Int32Ty, Int64Ty, 
Int64Ty);
+  llvm::Constant *RISCVCPUModel =
+  CGM.CreateRuntimeVariable(StructTy, "__riscv_cpu_model");
+  cast<llvm::GlobalValue>(RISCVCPUModel)->setDSOLocal(true);
+
+  auto loadRISCVCPUID = [&](unsigned Index) {
+Value *Ptr = Builder.CreateStructGEP(StructTy, RISCVCPUModel, Index);
+Value *CPUID = Builder.CreateAlignedLoad(StructTy->getTypeAtIndex(Index),
+ Ptr, llvm::MaybeAlign());
+return CPUID;
+  };
+
+  const llvm::RISCV::CPUModel CPUModel = llvm::RISCV::getCPUModel(CPUStr);
+
+  // Compare mvendorid.
+  Value *VendorID = loadRISCVCPUID(0);
+  Value *Result =
+  Builder.CreateICmpEQ(VendorID, Builder.getInt32(CPUModel.MVendorID));
+
+  // Compare marchid.
+  Value *ArchID = loadRISCVCPUID(1);
+  Result = Builder.CreateAnd(
+  Result, Builder.CreateICmpEQ(ArchID, 
Builder.getInt64(CPUModel.MArchID)));
+
+  // Compare mimplid.

topperc wrote:

mimpid

https://github.com/llvm/llvm-project/pull/116231
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget features for minimum3/maximum3 instructions (PR #116308)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Nov 18, 1:34 PM EST**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/116308).


https://github.com/llvm/llvm-project/pull/116308
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] AMDGPU: Add v_prng_b32 instruction for gfx950 (PR #116310)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Nov 18, 1:34 PM EST**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/116310).


https://github.com/llvm/llvm-project/pull/116310
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds (PR #116681)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116681

>From 884cb697a58e021372842cc674806a5228a84ef0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 18 Jan 2024 16:18:05 +0700
Subject: [PATCH] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds

Enforcing this limit in the clang builtin will come later.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   8 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp  |  18 ++
 llvm/lib/Target/AMDGPU/BUFInstructions.td |  24 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  16 ++
 .../llvm.amdgcn.global.load.lds.gfx950.ll |   8 +
 ...m.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll | 176 
 ...mdgcn.struct.ptr.buffer.load.lds.gfx950.ll | 196 ++
 llvm/test/MC/AMDGPU/mubuf-gfx950.s|  32 +++
 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt   |  19 ++
 9 files changed, 485 insertions(+), 12 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll
 create mode 100644 llvm/test/MC/AMDGPU/mubuf-gfx950.s

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f43ab50d2ea441..360af786c5160d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1674,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1693,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
llvm_i32_ty,   // imm offset(imm, included in bounds checking 
and swizzling)
@@ -1715,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
@@ -1735,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,// rsrc(SGPR)
LLVMQualPointerType<3>,// LDS base offset
-   llvm_i32_ty,   // Data byte size: 1/2/4
+   llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty,   // vindex(VGPR)
llvm_i32_ty,   // voffset(VGPR, included in bounds checking and 
swizzling)
llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds 
checking and swizzling)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a6ef0069f134bd..3522ece24f1c45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3240,6 +3240,24 @@ bool 
AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+
+Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+break;
   }
 
   Ma

[llvm-branch-commits] [flang] [llvm] [flang][OpenMP] Change clause modifier representation in parser (PR #116656)

2024-11-18 Thread Krzysztof Parzyszek via llvm-branch-commits

https://github.com/kparzysz updated 
https://github.com/llvm/llvm-project/pull/116656

>From e8bbc26e136993758c3a3197eed6b1924c6531d0 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek 
Date: Mon, 18 Nov 2024 08:47:24 -0600
Subject: [PATCH] [flang][OpenMP] Change clause modifier representation in
 parser

The main issue to solve is that OpenMP modifiers can be specified
in any order, so the parser cannot expect any specific modifier at
a given position. To solve that, define modifier to be a union of
all allowable specific modifiers for a given clause.

Additionally, implement modifier descriptors: for each modifier the
corresponding descriptor contains a set of properties of the modifier
that allow a common set of semantic checks. Start with the syntactic
properties defined in the spec: Required, Unique, Exclusive, Ultimate,
and implement common checks to verify each of them.

OpenMP modifier overhaul: #2/3
---
 .../flang/Semantics/openmp-modifiers.h| 391 ++
 flang/lib/Semantics/CMakeLists.txt|   1 +
 flang/lib/Semantics/openmp-modifiers.cpp  | 146 +++
 llvm/include/llvm/Frontend/OpenMP/OMP.h   |   2 +
 llvm/lib/Frontend/OpenMP/OMP.cpp  |   5 +
 5 files changed, 545 insertions(+)
 create mode 100644 flang/include/flang/Semantics/openmp-modifiers.h
 create mode 100644 flang/lib/Semantics/openmp-modifiers.cpp

diff --git a/flang/include/flang/Semantics/openmp-modifiers.h 
b/flang/include/flang/Semantics/openmp-modifiers.h
new file mode 100644
index 00..6be582761ed687
--- /dev/null
+++ b/flang/include/flang/Semantics/openmp-modifiers.h
@@ -0,0 +1,391 @@
+//===-- flang/lib/Semantics/openmp-modifiers.h --*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef FORTRAN_SEMANTICS_OPENMP_MODIFIERS_H_
+#define FORTRAN_SEMANTICS_OPENMP_MODIFIERS_H_
+
+#include "flang/Common/enum-set.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/semantics.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+
+#include 
+#include 
+#include 
+#include 
+
+namespace Fortran::semantics {
+
+// Ref: [5.2:58]
+//
+// Syntactic properties for Clauses, Arguments and Modifiers
+//
+// Inverse properties:
+//   not Required  -> Optional
+//   not Unique-> Repeatable
+//   not Exclusive -> Compatible
+//   not Ultimate  -> Free
+//
+// Clause defaults:   Optional, Repeatable, Compatible, Free
+// Argument defaults: Required, Unique, Compatible, Free
+// Modifier defaults: Optional, Unique, Compatible, Free
+//
+// ---
+// Each modifier is used as either pre-modifier (i.e. modifier: item),
+// or post-modifier (i.e. item: modifier). The default is pre-.
+// Add an additional property that reflects the type of modifier.
+
+ENUM_CLASS(OmpProperty, Required, Unique, Exclusive, Ultimate, Post);
+using OmpProperties = common::EnumSet;
+using OmpClauses =
+common::EnumSet;
+
+struct OmpModifierDescriptor {
+  // Modifier name for use in diagnostic messages.
+  const OmpProperties &props(unsigned version) const;
+  const OmpClauses &clauses(unsigned version) const;
+
+  const llvm::StringRef name;
+  // Version-dependent properties of the modifier.
+  const std::map props_;
+  // Version-dependent set of clauses to which the modifier can apply.
+  const std::map clauses_;
+};
+
+template  const OmpModifierDescriptor &OmpGetDescriptor();
+
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor 
&OmpGetDescriptor();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor();
+
+// Explanation of terminology:
+//
+// A typical clause with modifier[s] looks like this (with parts that are
+// not relevant here removed):
+//   struct OmpSomeClause {
+// struct Modifier {
+//   using Variant = std::variant<Specific1, Specific2, ...>;
+//   Variant u;
+// };
+// std::tuple>, ...> t;
+//   };
+//
+// The Specific1, etc. refer to parser classes that represent modifiers,
+// e.g. OmpIterator or OmpTaskDependenceType. The Variant type contains
+// all modifiers that are allowed for a given clause. The Modifier class
+// is there to wrap the variant into the form that the parse tree visitor
+// expects, i.e. with traits, member "u", etc.
+//
+// To avoid ambiguities with the word "modifier" (e.g. is it "any modifier",
+// or "this specific modifier"?), the following code uses different terms:
+//
+// - UnionTy:refers to the nested "Modifier" class, i.e.
+//   "OmpSomeClause::Modifier" in the example above.
+// - SpecificTy: refers to any 

[llvm-branch-commits] [llvm] AMDGPU: Increase the LDS size to support to 160 KB for gfx950 (PR #116309)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Nov 18, 1:34 PM EST**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/116309).


https://github.com/llvm/llvm-project/pull/116309
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 global_load_lds_* instructions (PR #116680)

2024-11-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/116680

>From 0443398b73f18791598db1bf6ab2274a46ac649f Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 18 Jan 2024 14:44:03 +0700
Subject: [PATCH] AMDGPU: Handle gfx950 global_load_lds_* instructions

Define global_load_lds_dwordx3 and global_load_lds_dwordx4.
Oddly it seems dwordx2 was skipped.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   2 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp  |  10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td|   9 ++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   7 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  10 ++
 .../llvm.amdgcn.global.load.lds.gfx950.ll | 137 ++
 llvm/test/MC/AMDGPU/gfx950_asm_features.s |  37 +
 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt   |  25 
 8 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll
 create mode 100644 llvm/test/MC/AMDGPU/gfx950_asm_features.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 15f33cdbf92e6e..f43ab50d2ea441 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2452,7 +2452,7 @@ class AMDGPUGlobalLoadLDS :
 [],
 [LLVMQualPointerType<1>,// Base global pointer to load from
  LLVMQualPointerType<3>,// LDS base pointer to store to
- llvm_i32_ty,   // Data byte size: 1/2/4
+ llvm_i32_ty,   // Data byte size: 1/2/4 (/12/16 for 
gfx950)
  llvm_i32_ty,   // imm offset (applied to both global 
and LDS address)
  llvm_i32_ty],  // auxiliary data (imm, cachepolicy 
(bit 0 = sc0,
 //   
bit 1 = sc1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 13de93e829fab2..a6ef0069f134bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3329,6 +3329,16 @@ bool 
AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
   case 4:
 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
 break;
+  case 12:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
+break;
+  case 16:
+if (!Subtarget->hasLDSLoadB96_B128())
+  return false;
+Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
+break;
   }
 
   MachineBasicBlock *MBB = MI.getParent();
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db74372e9db452..861fcf017d9e4d 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -934,6 +934,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_usho
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_sshort">;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dword">;
 
+let SubtargetPredicate = HasGFX950Insts in {
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dwordx3">;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dwordx4">;
+}
+
 let SubtargetPredicate = isGFX12Plus in {
   defm GLOBAL_ATOMIC_COND_SUB_U32: FLAT_Global_Atomic_Pseudo 
<"global_atomic_cond_sub_u32", VGPR_32, i32>;
   defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo 
<"global_atomic_ordered_add_b64", VReg_64, i64>;
@@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS 
<0x028, 0x12>;
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;
 
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;
+
+
 defm GLOBAL_ATOMIC_SWAP   : FLAT_Global_Real_Atomics_vi <0x40>;
 defm GLOBAL_ATOMIC_CMPSWAP: FLAT_Global_Real_Atomics_vi <0x41>;
 defm GLOBAL_ATOMIC_ADD: FLAT_Global_Real_Atomics_vi <0x42>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4a6efe533230b1..f3f96940c1f44b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1289,6 +1289,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // hasGFX940Insts and hasGFX90AInsts are also true.
   bool hasGFX950Insts() const { return GFX950Insts; }
 
+  /// Returns true if the target supports
+  /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
+  /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
+  bool hasLDSLoadB96_B128() const {
+return h

  1   2   >