[llvm-branch-commits] [flang] [flang][OpenMP] Enable delayed privatization by default for `omp.distribute` (PR #131574)

2025-03-17 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-flang-openmp

Author: Kareem Ergawy (ergawy)


Changes

Switches delayed privatization for `omp.distribute` on by default: it is now
controlled by the `-openmp-enable-delayed-privatization` flag instead of by
`-openmp-enable-delayed-privatization-staging`.

---
Full diff: https://github.com/llvm/llvm-project/pull/131574.diff


4 Files Affected:

- (modified) flang/lib/Lower/OpenMP/OpenMP.cpp (+1-1) 
- (modified) flang/test/Lower/OpenMP/distribute.f90 (+1-1) 
- (modified) flang/test/Lower/OpenMP/order-clause.f90 (+3-3) 
- (modified) flang/test/Transforms/stack-arrays-hlfir.f90 (+1-1) 


```diff
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp 
b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 2cfc1bd88dcef..f753ce1e82288 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2549,7 +2549,7 @@ static void 
genStandaloneDistribute(lower::AbstractConverter &converter,
 
   DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
/*shouldCollectPreDeterminedSymbols=*/true,
-   enableDelayedPrivatizationStaging, symTable);
+   enableDelayedPrivatization, symTable);
   dsp.processStep1(&distributeClauseOps);
 
   mlir::omp::LoopNestOperands loopNestClauseOps;
diff --git a/flang/test/Lower/OpenMP/distribute.f90 
b/flang/test/Lower/OpenMP/distribute.f90
index a4a753dddbac4..ea57d35b964b4 100644
--- a/flang/test/Lower/OpenMP/distribute.f90
+++ b/flang/test/Lower/OpenMP/distribute.f90
@@ -7,7 +7,7 @@ subroutine distribute_simple()
   ! CHECK: omp.teams
   !$omp teams
 
-  ! CHECK: omp.distribute {
+  ! CHECK: omp.distribute private({{.*}}) {
   !$omp distribute
 
   ! CHECK-NEXT: omp.loop_nest
diff --git a/flang/test/Lower/OpenMP/order-clause.f90 
b/flang/test/Lower/OpenMP/order-clause.f90
index 1f678e02708da..d5799079b3759 100644
--- a/flang/test/Lower/OpenMP/order-clause.f90
+++ b/flang/test/Lower/OpenMP/order-clause.f90
@@ -61,15 +61,15 @@ end subroutine do_simd_order_parallel
 
 
 subroutine distribute_order
-   !CHECK: omp.distribute order(reproducible:concurrent) {
+   !CHECK: omp.distribute order(reproducible:concurrent) private({{.*}}) {
!$omp teams distribute order(concurrent)
do i=1,10
end do
-   !CHECK: omp.distribute order(reproducible:concurrent) {
+   !CHECK: omp.distribute order(reproducible:concurrent) private({{.*}}) {
!$omp teams distribute order(reproducible:concurrent)
do i=1,10
end do
-   !CHECK: omp.distribute order(unconstrained:concurrent) {
+   !CHECK: omp.distribute order(unconstrained:concurrent) private({{.*}}) {
!$omp teams distribute order(unconstrained:concurrent)
do i = 1, 10
end do
diff --git a/flang/test/Transforms/stack-arrays-hlfir.f90 
b/flang/test/Transforms/stack-arrays-hlfir.f90
index 06749b7ca88af..e70a1d9b89216 100644
--- a/flang/test/Transforms/stack-arrays-hlfir.f90
+++ b/flang/test/Transforms/stack-arrays-hlfir.f90
@@ -73,7 +73,7 @@ end subroutine omp_target_wsloop
 ! CHECK-NOT:   fir.freemem
 ! CHECK: omp.teams {
 ! CHECK:   fir.alloca !fir.array<2xi64>
-! CHECK: omp.distribute {
+! CHECK: omp.distribute private({{.*}}) {
 ! CHECK: omp.loop_nest {{.*}} {
 ! CHECK-NOT:   fir.allocmem
 ! CHECK-NOT:   fir.freemem

```




https://github.com/llvm/llvm-project/pull/131574
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/20.x: [SCEV] Check whether the start is non-zero in `ScalarEvolution::howFarToZero` (#131522) (PR #131568)

2025-03-17 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: None (llvmbot)


Changes

Backport c5a491e9ea22014b65664b6e09134b4f055933e2

Requested by: @dtcxzyw

---
Full diff: https://github.com/llvm/llvm-project/pull/131568.diff


3 Files Affected:

- (modified) llvm/lib/Analysis/ScalarEvolution.cpp (+5-4) 
- (modified) llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll (+26-8) 
- (added) llvm/test/Transforms/LoopUnroll/pr131465.ll (+43) 


```diff
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp 
b/llvm/lib/Analysis/ScalarEvolution.cpp
index c71202c8dd58e..b8069df4e6598 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10635,10 +10635,11 @@ ScalarEvolution::ExitLimit 
ScalarEvolution::howFarToZero(const SCEV *V,
   if (ControlsOnlyExit && AddRec->hasNoSelfWrap() &&
   loopHasNoAbnormalExits(AddRec->getLoop())) {
 
-// If the stride is zero, the loop must be infinite.  In C++, most loops
-// are finite by assumption, in which case the step being zero implies
-// UB must execute if the loop is entered.
-if (!loopIsFiniteByAssumption(L) && !isKnownNonZero(StepWLG))
+// If the stride is zero and the start is non-zero, the loop must be
+// infinite. In C++, most loops are finite by assumption, in which case the
+// step being zero implies UB must execute if the loop is entered.
+if (!(loopIsFiniteByAssumption(L) && isKnownNonZero(Start)) &&
+!isKnownNonZero(StepWLG))
   return getCouldNotCompute();
 
 const SCEV *Exact =
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll 
b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
index 2d02cb6194f4c..1f08a620b2e15 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
@@ -329,10 +329,9 @@ define void @ne_nsw_nonneg_step(ptr nocapture %A, i32 %n, 
i32 %s) mustprogress {
 ;
 ; CHECK-LABEL: 'ne_nsw_nonneg_step'
 ; CHECK-NEXT:  Determining loop execution counts for: @ne_nsw_nonneg_step
-; CHECK-NEXT:  Loop %for.body: backedge-taken count is (((-1 * %s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 -1
-; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is (((-1 * 
%s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
 ;
 entry:
   %nonneg_step = icmp sge i32 %s, 0
@@ -442,10 +441,9 @@ define void @ne_nuw_nonneg_step(ptr nocapture %A, i32 %n, 
i32 %s) mustprogress {
 ;
 ; CHECK-LABEL: 'ne_nuw_nonneg_step'
 ; CHECK-NEXT:  Determining loop execution counts for: @ne_nuw_nonneg_step
-; CHECK-NEXT:  Loop %for.body: backedge-taken count is (((-1 * %s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 -1
-; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is (((-1 * 
%s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
 ;
 entry:
   %nonneg_step = icmp sge i32 %s, 0
@@ -493,6 +491,26 @@ for.end:  ; preds 
= %for.body, %entry
   ret void
 }
 
+define i32 @pr131465(i1 %x) mustprogress {
+; CHECK-LABEL: 'pr131465'
+; CHECK-NEXT:  Determining loop execution counts for: @pr131465
+; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  %inc = zext i1 %x to i32
+  br label %for.body
+
+for.body:
+  %indvar = phi i32 [ 2, %entry ], [ %next, %for.body ]
+  %next = add nsw i32 %indvar, %inc
+  %exitcond = icmp eq i32 %next, 2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 0
+}
 
 declare void @llvm.assume(i1)
 
diff --git a/llvm/test/Transforms/LoopUnroll/pr131465.ll 
b/llvm/test/Transforms/LoopUnroll/pr131465.ll
new file mode 100644
index 0..643b020c6c110
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/pr131465.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5
+; RUN: opt -S -passes=loop-unroll -unroll-runtime %s | FileCheck %s
+
+define i32 @pr131465(i1 %x) mustprogress {
+; CHECK-LABEL: define i32 @pr131465(
+; CHECK-SAME: i1 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:[[INC:%.*]] = zext i1 [[X]] to i32
+; CHECK-NEXT:br label %[[FOR_BODY:.*]]
+; CHECK:   [[FOR_BODY]]:
+; 

[llvm-branch-commits] [llvm] release/20.x: [SCEV] Check whether the start is non-zero in `ScalarEvolution::howFarToZero` (#131522) (PR #131568)

2025-03-17 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-analysis

Author: None (llvmbot)


Changes

Backport c5a491e9ea22014b65664b6e09134b4f055933e2

Requested by: @dtcxzyw

---
Full diff: https://github.com/llvm/llvm-project/pull/131568.diff


3 Files Affected:

- (modified) llvm/lib/Analysis/ScalarEvolution.cpp (+5-4) 
- (modified) llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll (+26-8) 
- (added) llvm/test/Transforms/LoopUnroll/pr131465.ll (+43) 


```diff
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp 
b/llvm/lib/Analysis/ScalarEvolution.cpp
index c71202c8dd58e..b8069df4e6598 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10635,10 +10635,11 @@ ScalarEvolution::ExitLimit 
ScalarEvolution::howFarToZero(const SCEV *V,
   if (ControlsOnlyExit && AddRec->hasNoSelfWrap() &&
   loopHasNoAbnormalExits(AddRec->getLoop())) {
 
-// If the stride is zero, the loop must be infinite.  In C++, most loops
-// are finite by assumption, in which case the step being zero implies
-// UB must execute if the loop is entered.
-if (!loopIsFiniteByAssumption(L) && !isKnownNonZero(StepWLG))
+// If the stride is zero and the start is non-zero, the loop must be
+// infinite. In C++, most loops are finite by assumption, in which case the
+// step being zero implies UB must execute if the loop is entered.
+if (!(loopIsFiniteByAssumption(L) && isKnownNonZero(Start)) &&
+!isKnownNonZero(StepWLG))
   return getCouldNotCompute();
 
 const SCEV *Exact =
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll 
b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
index 2d02cb6194f4c..1f08a620b2e15 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
@@ -329,10 +329,9 @@ define void @ne_nsw_nonneg_step(ptr nocapture %A, i32 %n, 
i32 %s) mustprogress {
 ;
 ; CHECK-LABEL: 'ne_nsw_nonneg_step'
 ; CHECK-NEXT:  Determining loop execution counts for: @ne_nsw_nonneg_step
-; CHECK-NEXT:  Loop %for.body: backedge-taken count is (((-1 * %s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 -1
-; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is (((-1 * 
%s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
 ;
 entry:
   %nonneg_step = icmp sge i32 %s, 0
@@ -442,10 +441,9 @@ define void @ne_nuw_nonneg_step(ptr nocapture %A, i32 %n, 
i32 %s) mustprogress {
 ;
 ; CHECK-LABEL: 'ne_nuw_nonneg_step'
 ; CHECK-NEXT:  Determining loop execution counts for: @ne_nuw_nonneg_step
-; CHECK-NEXT:  Loop %for.body: backedge-taken count is (((-1 * %s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: constant max backedge-taken count is i32 -1
-; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is (((-1 * 
%s) + %n) /u %s)
-; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
 ;
 entry:
   %nonneg_step = icmp sge i32 %s, 0
@@ -493,6 +491,26 @@ for.end:  ; preds 
= %for.body, %entry
   ret void
 }
 
+define i32 @pr131465(i1 %x) mustprogress {
+; CHECK-LABEL: 'pr131465'
+; CHECK-NEXT:  Determining loop execution counts for: @pr131465
+; CHECK-NEXT:  Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %for.body: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  %inc = zext i1 %x to i32
+  br label %for.body
+
+for.body:
+  %indvar = phi i32 [ 2, %entry ], [ %next, %for.body ]
+  %next = add nsw i32 %indvar, %inc
+  %exitcond = icmp eq i32 %next, 2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 0
+}
 
 declare void @llvm.assume(i1)
 
diff --git a/llvm/test/Transforms/LoopUnroll/pr131465.ll 
b/llvm/test/Transforms/LoopUnroll/pr131465.ll
new file mode 100644
index 0..643b020c6c110
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/pr131465.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5
+; RUN: opt -S -passes=loop-unroll -unroll-runtime %s | FileCheck %s
+
+define i32 @pr131465(i1 %x) mustprogress {
+; CHECK-LABEL: define i32 @pr131465(
+; CHECK-SAME: i1 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:[[INC:%.*]] = zext i1 [[X]] to i32
+; CHECK-NEXT:br label %[[FOR_BODY:.*]]
+; CHECK:   [[FOR_BODY]]:
+; CH

[llvm-branch-commits] [llvm] [llvm] Add option to emit `callgraph` section (PR #87574)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -504,12 +504,18 @@ bool MIRParserImpl::initializeCallSiteInfo(
 return error(Error, ArgRegPair.Reg.SourceRange);
   CSInfo.ArgRegPairs.emplace_back(Reg, ArgRegPair.ArgNo);
 }
+if (YamlCSInfo.TypeId.has_value()) {
+  IntegerType *Int64Ty = Type::getInt64Ty(Context);
+  CSInfo.TypeId = ConstantInt::get(Int64Ty, YamlCSInfo.TypeId.value(),
+   /*isSigned=*/false);
+}

arsenm wrote:

```suggestion
if (YamlCSInfo.TypeId) {
  IntegerType *Int64Ty = Type::getInt64Ty(Context);
  CSInfo.TypeId = ConstantInt::get(Int64Ty, *YamlCSInfo.TypeId,
   /*isSigned=*/false);
}
```
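
For readers unfamiliar with the idiom being suggested: `std::optional`
converts contextually to `bool`, and `operator*` is the unchecked accessor,
so `if (Opt)` and `*Opt` express the same thing as `has_value()`/`value()`
with less noise (and without `value()`'s throw-on-empty check). A
self-contained illustration, separate from the MIR parser code:

```cpp
#include <cstdint>
#include <cstdio>
#include <optional>

void printTypeId(std::optional<uint64_t> TypeId) {
  if (TypeId) // equivalent to TypeId.has_value()
    std::printf("typeId: %llu\n",
                static_cast<unsigned long long>(*TypeId)); // vs TypeId.value()
}
```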

https://github.com/llvm/llvm-project/pull/87574
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [llvm] Add option to emit `callgraph` section (PR #87574)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -504,12 +504,18 @@ bool MIRParserImpl::initializeCallSiteInfo(
 return error(Error, ArgRegPair.Reg.SourceRange);
   CSInfo.ArgRegPairs.emplace_back(Reg, ArgRegPair.ArgNo);
 }
+if (YamlCSInfo.TypeId.has_value()) {
+  IntegerType *Int64Ty = Type::getInt64Ty(Context);
+  CSInfo.TypeId = ConstantInt::get(Int64Ty, YamlCSInfo.TypeId.value(),
+   /*isSigned=*/false);
+}
 
-if (TM.Options.EmitCallSiteInfo)
+if (TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection)
   MF.addCallSiteInfo(&*CallI, std::move(CSInfo));
   }
 
-  if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo)
+  if (YamlMF.CallSitesInfo.size() &&

arsenm wrote:

```suggestion
  if (!YamlMF.CallSitesInfo.empty() &&
```
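
The rationale, shown on a generic container: `empty()` states the intent
directly and is guaranteed O(1) for every standard container, while a bare
`size()` in boolean context reads as an accidental integer-to-bool
conversion:

```cpp
#include <vector>

bool hasCallSiteInfo(const std::vector<int> &CallSitesInfo) {
  return !CallSitesInfo.empty(); // preferred over `CallSitesInfo.size()`
}
```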

https://github.com/llvm/llvm-project/pull/87574
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [flang][OpenMP] Map simple `do concurrent` loops to OpenMP host constructs (PR #127633)

2025-03-17 Thread Kareem Ergawy via llvm-branch-commits

https://github.com/ergawy updated 
https://github.com/llvm/llvm-project/pull/127633

>From 35dd008e971fb42d457d04f26ca7b8f69c19082d Mon Sep 17 00:00:00 2001
From: ergawy 
Date: Tue, 18 Feb 2025 02:50:46 -0600
Subject: [PATCH 1/3] [flang][OpenMP] Map simple `do concurrent` loops to
 OpenMP host constructs

Upstreams one more part of the ROCm `do concurrent` to OpenMP mapping
pass. This PR adds support for converting simple loops to the equivalent
OpenMP constructs on the host: `omp parallel do`. Towards that end, we
have to collect more information about loop nests, for which we add new
utils in the `looputils` namespace.
---
 flang/docs/DoConcurrentConversionToOpenMP.md  |  47 
 .../OpenMP/DoConcurrentConversion.cpp | 211 +-
 .../Transforms/DoConcurrent/basic_host.f90|  14 +-
 .../Transforms/DoConcurrent/basic_host.mlir   |  62 +
 .../DoConcurrent/non_const_bounds.f90 |  45 
 .../DoConcurrent/not_perfectly_nested.f90 |  45 
 6 files changed, 405 insertions(+), 19 deletions(-)
 create mode 100644 flang/test/Transforms/DoConcurrent/basic_host.mlir
 create mode 100644 flang/test/Transforms/DoConcurrent/non_const_bounds.f90
 create mode 100644 flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90

diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md 
b/flang/docs/DoConcurrentConversionToOpenMP.md
index 7b49af742f242..19611615ee9d6 100644
--- a/flang/docs/DoConcurrentConversionToOpenMP.md
+++ b/flang/docs/DoConcurrentConversionToOpenMP.md
@@ -126,6 +126,53 @@ see the "Data environment" section below.
 See `flang/test/Transforms/DoConcurrent/loop_nest_test.f90` for more examples
 of what is and is not detected as a perfect loop nest.
 
+### Single-range loops
+
+Given the following loop:
+```fortran
+  do concurrent(i=1:n)
+a(i) = i * i
+  end do
+```
+
+#### Mapping to `host`
+
+Mapping this loop to the `host`, generates MLIR operations of the following
+structure:
+
+```
+%4 = fir.address_of(@_QFEa) ...
+%6:2 = hlfir.declare %4 ...
+
+omp.parallel {
+  // Allocate private copy for `i`.
+  // TODO Use delayed privatization.
+  %19 = fir.alloca i32 {bindc_name = "i"}
+  %20:2 = hlfir.declare %19 {uniq_name = "_QFEi"} ...
+
+  omp.wsloop {
+omp.loop_nest (%arg0) : index = (%21) to (%22) inclusive step (%c1_2) {
+  %23 = fir.convert %arg0 : (index) -> i32
+  // Use the privatized version of `i`.
+  fir.store %23 to %20#1 : !fir.ref
+  ...
+
+  // Use "shared" SSA value of `a`.
+  %42 = hlfir.designate %6#0
+  hlfir.assign %35 to %42
+  ...
+  omp.yield
+}
+omp.terminator
+  }
+  omp.terminator
+}
+```
+
+#### Mapping to `device`
+
+
+
 

[llvm-branch-commits] [llvm] [llvm] Add option to emit `callgraph` section (PR #87574)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -504,12 +504,18 @@ bool MIRParserImpl::initializeCallSiteInfo(
 return error(Error, ArgRegPair.Reg.SourceRange);
   CSInfo.ArgRegPairs.emplace_back(Reg, ArgRegPair.ArgNo);
 }
+if (YamlCSInfo.TypeId.has_value()) {
+  IntegerType *Int64Ty = Type::getInt64Ty(Context);
+  CSInfo.TypeId = ConstantInt::get(Int64Ty, YamlCSInfo.TypeId.value(),
+   /*isSigned=*/false);
+}
 
-if (TM.Options.EmitCallSiteInfo)
+if (TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection)
   MF.addCallSiteInfo(&*CallI, std::move(CSInfo));
   }
 
-  if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo)
+  if (YamlMF.CallSitesInfo.size() &&
+  !(TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection))
 return error(Twine("Call site info provided but not used"));

arsenm wrote:

Pre-existing issue but errors should start with lowercase 

https://github.com/llvm/llvm-project/pull/87574
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [llvm] Extract and propagate indirect call type id (PR #87575)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -0,0 +1,29 @@
+;; Tests that call site type ids can be extracted and set from type operand
+;; bundles.
+
+;; Verify the exact typeId value to ensure it is not garbage but the value
+;; computed as the type id from the type operand bundle.
+; RUN: llc --call-graph-section -mtriple aarch64-linux-gnu < %s 
-stop-before=finalize-isel -o - | FileCheck %s
+
+define dso_local void @foo(i8 signext %a) !type !0 {
+entry:
+  ret void
+}
+
+; CHECK: name: main
+define dso_local i32 @main() !type !1 {
+entry:
+  %retval = alloca i32, align 4
+  %fp = alloca ptr, align 8
+  store i32 0, ptr %retval, align 4
+  store ptr @foo, ptr %fp, align 8
+  %fp_val = load ptr, ptr %fp, align 8
+  ; CHECK: callSites:
+  ; CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], typeId:
+  ; CHECK-NEXT: 7854600665770582568 }
+  call void %fp_val(i8 signext 97) [ "callee_type"(metadata 
!"_ZTSFvcE.generalized") ]
+  ret i32 0
+}
+

arsenm wrote:

Test case with indirect tail call? 

https://github.com/llvm/llvm-project/pull/87575
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [libcxx] [libc++] Implement std::move_only_function (P0288R9) (PR #94670)

2025-03-17 Thread A. Jiang via llvm-branch-commits

https://github.com/frederick-vs-ja edited 
https://github.com/llvm/llvm-project/pull/94670
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [llvm] Extract and propagate indirect call type id (PR #87575)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -0,0 +1,29 @@
+;; Tests that call site type ids can be extracted and set from type operand
+;; bundles.
+
+;; Verify the exact typeId value to ensure it is not garbage but the value
+;; computed as the type id from the type operand bundle.
+; RUN: llc --call-graph-section -mtriple aarch64-linux-gnu < %s 
-stop-before=finalize-isel -o - | FileCheck %s

arsenm wrote:

Why before finalize-isel and not after? Why does this need to check 
intermediate mir at all? 

https://github.com/llvm/llvm-project/pull/87575
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Replace unused permlane inputs with poison instead of undef (PR #131288)

2025-03-17 Thread Nuno Lopes via llvm-branch-commits

https://github.com/nunoplopes approved this pull request.


https://github.com/llvm/llvm-project/pull/131288
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NPM] Port LiveDebugValues to NPM (PR #131563)

2025-03-17 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan updated 
https://github.com/llvm/llvm-project/pull/131563

>From e743c8a2a8b1d22455aa6fb62d119fa7f7a6d1f5 Mon Sep 17 00:00:00 2001
From: Akshat Oke 
Date: Wed, 12 Mar 2025 09:31:58 +
Subject: [PATCH 1/2] [CodeGen][NPM] Port LiveDebugValues to NPM

---
 .../llvm/CodeGen/LiveDebugValuesPass.h| 30 +
 llvm/include/llvm/InitializePasses.h  |  2 +-
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  4 +-
 .../llvm/Passes/MachinePassRegistry.def   | 12 +++-
 llvm/lib/CodeGen/CodeGen.cpp  |  2 +-
 .../LiveDebugValues/LiveDebugValues.cpp   | 63 ++-
 llvm/lib/Passes/PassBuilder.cpp   |  1 +
 llvm/test/CodeGen/ARM/dbg-range-extension.mir |  1 +
 .../compiler-gen-bbs-livedebugvalues.mir  |  3 +
 9 files changed, 98 insertions(+), 20 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/LiveDebugValuesPass.h

diff --git a/llvm/include/llvm/CodeGen/LiveDebugValuesPass.h 
b/llvm/include/llvm/CodeGen/LiveDebugValuesPass.h
new file mode 100644
index 0..023a699360688
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/LiveDebugValuesPass.h
@@ -0,0 +1,30 @@
+//===- llvm/CodeGen/LiveDebugValuesPass.h *- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_CODEGEN_LIVEDEBUGVALUESPASS_H
+#define LLVM_CODEGEN_LIVEDEBUGVALUESPASS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class LiveDebugValuesPass : public PassInfoMixin<LiveDebugValuesPass> {
+  bool ShouldEmitDebugEntryValues;
+
+public:
+  LiveDebugValuesPass(bool ShouldEmitDebugEntryValues)
+  : ShouldEmitDebugEntryValues(ShouldEmitDebugEntryValues) {}
+  PreservedAnalyses run(MachineFunction &MF,
+MachineFunctionAnalysisManager &MFAM);
+  void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_LIVEDEBUGVALUESPASS_H
diff --git a/llvm/include/llvm/InitializePasses.h 
b/llvm/include/llvm/InitializePasses.h
index 460c7eb3ebe24..e820277724393 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -153,7 +153,7 @@ void initializeLegacyLICMPassPass(PassRegistry &);
 void initializeLegalizerPass(PassRegistry &);
 void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &);
 void initializeGISelKnownBitsAnalysisPass(PassRegistry &);
-void initializeLiveDebugValuesPass(PassRegistry &);
+void initializeLiveDebugValuesLegacyPass(PassRegistry &);
 void initializeLiveDebugVariablesWrapperLegacyPass(PassRegistry &);
 void initializeLiveIntervalsWrapperPassPass(PassRegistry &);
 void initializeLiveRangeShrinkPass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h 
b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 571b363fadfc2..bdb81cf77cfd1 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -45,6 +45,7 @@
 #include "llvm/CodeGen/InterleavedAccess.h"
 #include "llvm/CodeGen/InterleavedLoadCombine.h"
 #include "llvm/CodeGen/JMCInstrumenter.h"
+#include "llvm/CodeGen/LiveDebugValuesPass.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LocalStackSlotAllocation.h"
 #include "llvm/CodeGen/LowerEmuTLS.h"
@@ -1002,7 +1003,8 @@ Error CodeGenPassBuilder::addMachinePasses(
   addPass(FuncletLayoutPass());
 
   addPass(StackMapLivenessPass());
-  addPass(LiveDebugValuesPass());
+  addPass(LiveDebugValuesPass(
+  getTM().Options.ShouldEmitDebugEntryValues()));
   addPass(MachineSanitizerBinaryMetadata());
 
   if (TM.Options.EnableMachineOutliner &&
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def 
b/llvm/include/llvm/Passes/MachinePassRegistry.def
index d3320ef82098c..956304560b683 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -214,6 +214,17 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS(
 },
 "enable-tail-merge")
 
+MACHINE_FUNCTION_PASS_WITH_PARAMS(
+"live-debug-values", "LiveDebugValuesPass",
+[](bool ShouldEmitDebugEntryValues) {
+  return LiveDebugValuesPass(ShouldEmitDebugEntryValues);
+},
+[](StringRef Params) {
+  return parseSinglePassOption(Params, "emit-debug-entry-values",
+   "LiveDebugValuesPass");
+},
+"emit-debug-entry-values")
+
 MACHINE_FUNCTION_PASS_WITH_PARAMS(
 "machine-sink", "MachineSinkingPass",
 [](bool EnableSinkAndFold) {
@@ -278,7 +289,6 @@ DUMMY_MACHINE_FUNCTION_PASS("instruction-select", 
InstructionSelectPass)
 DUMMY_MACHINE_FUNCTION_PASS("irtranslator", IRTranslatorPass)
 DUMMY_MACHINE_FUNCTION_PASS("kcfi", MachineKCFIPass)
 DUM
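
The header in this patch declares `printPipeline`, but its definition falls
outside this excerpt. A plausible sketch of how a parameterized machine pass
prints itself, inferred from the registry entry above rather than taken from
the PR:

```cpp
void LiveDebugValuesPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  OS << MapClassName2PassName(name()); // name() is provided by PassInfoMixin
  if (ShouldEmitDebugEntryValues)
    OS << "<emit-debug-entry-values>"; // must round-trip through the
                                       // parseSinglePassOption parser above
}
```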

[llvm-branch-commits] [llvm] release/20.x: [SCEV] Check whether the start is non-zero in `ScalarEvolution::howFarToZero` (#131522) (PR #131568)

2025-03-17 Thread Nikita Popov via llvm-branch-commits

https://github.com/nikic approved this pull request.


https://github.com/llvm/llvm-project/pull/131568
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [libcxx] [libc++][format] Implements P3107R5 in . (PR #130500)

2025-03-17 Thread Mark de Wever via llvm-branch-commits

https://github.com/mordante updated 
https://github.com/llvm/llvm-project/pull/130500

>From c132aa555a38efde9b04c2a3f435ba598778c28d Mon Sep 17 00:00:00 2001
From: Mark de Wever 
Date: Sat, 30 Mar 2024 17:35:56 +0100
Subject: [PATCH 1/3] [libc++][format] Implements P3107R5 in .

The follow-up paper P3235R3, which was voted in as a DR, changes the names
foo_locking to foo_buffered. These changes have been applied in this
patch.

Before
---
Benchmark Time CPU   Iterations
---
printf 71.3 ns 71.3 ns  9525175
print_string226 ns  226 ns  3105850
print_stack 232 ns  232 ns  3026498
print_direct530 ns  530 ns  1318447

After
---
Benchmark Time CPU   Iterations
---
printf 70.6 ns 70.6 ns  9789585
print_string222 ns  222 ns  3147678
print_stack 227 ns  227 ns  3084767
print_direct474 ns  474 ns  1472786

Note: The performance of libc++'s std::print is still extremely slow
compared to printf. Based on P3107R5, std::print should outperform
printf. The main culprit is the call to isatty, which will be resolved
after implementing
LWG4044: Confusing requirements for std::print on POSIX platforms

Implements
- P3107R5 - Permit an efficient implementation of ``std::print``

Implements parts of
- P3235R3 std::print more types faster with less memory

Fixes: #105435
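
For context, the user-facing API whose hot path this patch optimizes;
standard C++23 usage, independent of the patch:

```cpp
#include <print>

int main() {
  std::print("{} + {} = {}\n", 1, 2, 1 + 2); // formats, then writes to stdout
  std::println("pi ~= {:.3f}", 3.14159);     // same, plus a trailing newline
}
```

Both calls funnel into the `vprint_*` family shown in the synopsis in the
diff below.
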
---
 libcxx/docs/ReleaseNotes/21.rst   |   1 +
 libcxx/include/__format/buffer.h  |   3 +
 libcxx/include/print  | 270 +-
 libcxx/modules/std/print.inc  |   1 +
 .../test/libcxx/system_reserved_names.gen.py  |   5 +
 .../test/libcxx/transitive_includes/cxx03.csv |   5 +
 .../test/libcxx/transitive_includes/cxx11.csv |   5 +
 .../test/libcxx/transitive_includes/cxx14.csv |   5 +
 .../test/libcxx/transitive_includes/cxx17.csv |   5 +
 .../test/libcxx/transitive_includes/cxx23.csv |   5 +-
 .../test/libcxx/transitive_includes/cxx26.csv |   4 +
 11 files changed, 296 insertions(+), 13 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index e7cfa625a132c..a1f30b26c5a1d 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -40,6 +40,7 @@ Implemented Papers
 
 - N4258: Cleaning-up noexcept in the Library (`Github 
`__)
 - P1361R2: Integration of chrono with text formatting (`Github 
`__)
+- P3107R5 - Permit an efficient implementation of ``std::print`` (`Github 
`__)
 
 Improvements and New Features
 -
diff --git a/libcxx/include/__format/buffer.h b/libcxx/include/__format/buffer.h
index c88b7f3222010..d6e4ddc840e2d 100644
--- a/libcxx/include/__format/buffer.h
+++ b/libcxx/include/__format/buffer.h
@@ -12,6 +12,7 @@
 
 #include <__algorithm/copy_n.h>
 #include <__algorithm/fill_n.h>
+#include <__algorithm/for_each.h>
 #include <__algorithm/max.h>
 #include <__algorithm/min.h>
 #include <__algorithm/ranges_copy.h>
@@ -34,11 +35,13 @@
 #include <__memory/construct_at.h>
 #include <__memory/destroy.h>
 #include <__memory/uninitialized_algorithms.h>
+#include <__system_error/system_error.h>
 #include <__type_traits/add_pointer.h>
 #include <__type_traits/conditional.h>
 #include <__utility/exception_guard.h>
 #include <__utility/move.h>
 #include 
+#include  // Uses the POSIX/Windows unlocked stream I/O
 #include 
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/print b/libcxx/include/print
index 1794d6014efcd..f6d03edfbd4bc 100644
--- a/libcxx/include/print
+++ b/libcxx/include/print
@@ -27,9 +27,11 @@ namespace std {
 
   void vprint_unicode(string_view fmt, format_args args);
   void vprint_unicode(FILE* stream, string_view fmt, format_args args);
+  void vprint_unicode_buffered(FILE* stream, string_view fmt, format_args 
args);
 
   void vprint_nonunicode(string_view fmt, format_args args);
   void vprint_nonunicode(FILE* stream, string_view fmt, format_args args);
+  void vprint_nonunicode_buffered(FILE* stream, string_view fmt, format_args 
args);
 }
 */
 
@@ -41,6 +43,7 @@ namespace std {
 #  include <__config>
 #  include <__system_error/throw_system_error.h>
 #  include <__utility/forward.h>
+#  include <__utility/move.h>
 #  include 
 #  include 
 #  include 
@@ -52,6 +55,9 @@ namespace std {
 #pragma GCC system_header
 #  endif
 
+_LIBCPP_PUSH_MACROS
+#  include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 #  ifdef _LIBCPP_WIN32API
@@ -213,6 +219,122 @@ _LIBCPP_HIDE_FROM_ABI inline

[llvm-branch-commits] [clang] release/20.x: [Clang] Do not emit nodiscard warnings for the base expr of static member access (#131450) (PR #131474)

2025-03-17 Thread via llvm-branch-commits

https://github.com/cor3ntin updated 
https://github.com/llvm/llvm-project/pull/131474

>From e46c31e5a5d2aae2fcfc8d835681fcb58ea4c505 Mon Sep 17 00:00:00 2001
From: cor3ntin 
Date: Sat, 15 Mar 2025 22:27:08 +0100
Subject: [PATCH 1/2] [Clang] Do not emit nodiscard warnings for the base expr
 of static member access (#131450)

For an expression `nodiscard_function().static_member()`, the nodiscard
warnings added by #120223 are not useful or actionable, and are
disruptive to some library implementations; we just remove them.

Fixes #131410

(cherry picked from commit 9a1e39062b2ab445f1f4424ecdc5ffb46e8cb9e0)
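
A minimal illustration of the behavior change, with hypothetical names
mirroring the tests below:

```cpp
struct X {
  static int static_member() { return 0; }
};

[[nodiscard]] X get_X() { return {}; }

void f() {
  get_X();                       // still warns: the X result is discarded
  (void)get_X().static_member(); // no longer warns about the discarded X
                                 // base: it exists only to name the member
}
```
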
---
 clang/include/clang/Sema/Sema.h|  5 -
 clang/lib/Sema/SemaExprMember.cpp  |  1 -
 clang/lib/Sema/SemaStmt.cpp|  4 
 .../CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp | 10 ++
 clang/test/SemaCXX/ms-property.cpp |  2 +-
 5 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index a30a7076ea5d4..6e2e5aaff2347 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -10671,11 +10671,6 @@ class Sema final : public SemaBase {
SourceLocation EndLoc);
   void ActOnForEachDeclStmt(DeclGroupPtrTy Decl);
 
-  /// DiagnoseDiscardedExprMarkedNodiscard - Given an expression that is
-  /// semantically a discarded-value expression, diagnose if any [[nodiscard]]
-  /// value has been discarded.
-  void DiagnoseDiscardedExprMarkedNodiscard(const Expr *E);
-
   /// DiagnoseUnusedExprResult - If the statement passed in is an expression
   /// whose result is unused, warn.
   void DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID);
diff --git a/clang/lib/Sema/SemaExprMember.cpp 
b/clang/lib/Sema/SemaExprMember.cpp
index d130e8b86bc56..adb8e3cc90c0c 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -1136,7 +1136,6 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType 
BaseExprType,
 if (Converted.isInvalid())
   return true;
 BaseExpr = Converted.get();
-DiagnoseDiscardedExprMarkedNodiscard(BaseExpr);
 return false;
   };
   auto ConvertBaseExprToGLValue = [&] {
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 947651d514b3b..b8b59793d6508 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -413,10 +413,6 @@ void DiagnoseUnused(Sema &S, const Expr *E, 
std::optional DiagID) {
 }
 } // namespace
 
-void Sema::DiagnoseDiscardedExprMarkedNodiscard(const Expr *E) {
-  DiagnoseUnused(*this, E, std::nullopt);
-}
-
 void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) {
   if (const LabelStmt *Label = dyn_cast_if_present(S))
 S = Label->getSubStmt();
diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp 
b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
index 18f4bd5e9c0fa..0012ab976baa5 100644
--- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
+++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
@@ -164,19 +164,21 @@ struct X {
 
 [[nodiscard]] X get_X();
 // cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}}
+[[nodiscard]] X* get_Ptr();
+// cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}}
 void f() {
+  get_X(); // expected-warning{{ignoring return value of function declared 
with 'nodiscard' attribute}}
+  (void) get_X();
   (void) get_X().variant_member;
   (void) get_X().anonymous_struct_member;
   (void) get_X().data_member;
   (void) get_X().static_data_member;
-  // expected-warning@-1 {{ignoring return value of function declared with 
'nodiscard' attribute}}
   (void) get_X().unscoped_enum;
-  // expected-warning@-1 {{ignoring return value of function declared with 
'nodiscard' attribute}}
   (void) get_X().scoped_enum;
-  // expected-warning@-1 {{ignoring return value of function declared with 
'nodiscard' attribute}}
   (void) get_X().implicit_object_member_function();
   (void) get_X().static_member_function();
-  // expected-warning@-1 {{ignoring return value of function declared with 
'nodiscard' attribute}}
+  (void) get_Ptr()->implicit_object_member_function();
+  (void) get_Ptr()->static_member_function();
 #if __cplusplus >= 202302L
   (void) get_X().explicit_object_member_function();
 #endif
diff --git a/clang/test/SemaCXX/ms-property.cpp 
b/clang/test/SemaCXX/ms-property.cpp
index d5799a8a4d363..f1424b9cb12bc 100644
--- a/clang/test/SemaCXX/ms-property.cpp
+++ b/clang/test/SemaCXX/ms-property.cpp
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -triple=x86_64-pc-win32 -fms-compatibility -emit-pch -o %t 
-verify %s
 // RUN: %clang_cc1 -triple=x86_64-pc-win32 -fms-compatibility -include-pch %t 
%s -ast-print -o - | FileCheck %s
 // RUN: %clang_cc1 -fdeclspec -fsyntax-only -verify %s -std=c++23
+// expected-no-diagnostics
 
 #ifndef HEADER
 #

[llvm-branch-commits] [clang] [Clang][CodeGen] Do not promote if complex divisor is real (PR #131451)

2025-03-17 Thread Eli Friedman via llvm-branch-commits

https://github.com/efriedma-quic edited 
https://github.com/llvm/llvm-project/pull/131451
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [Clang][CodeGen] Do not promote if complex divisor is real (PR #131451)

2025-03-17 Thread Eli Friedman via llvm-branch-commits


@@ -314,7 +313,7 @@ class ComplexExprEmitter
   }
 
   QualType getPromotionType(FPOptionsOverride Features, QualType Ty,
-bool IsDivOpCode = false) {
+bool IsComplexDivisor = false) {

efriedma-quic wrote:

Maybe delete the `= false`? Default arguments tend to be confusing in cases
like this.
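
The concern, illustrated on a hypothetical function of the same shape (not
the actual Clang code): a defaulted boolean makes call sites ambiguous, both
when the argument is omitted and when a bare literal is passed:

```cpp
#include <string>

std::string getPromotionType(const std::string &Ty,
                             bool IsComplexDivisor = false) {
  return IsComplexDivisor ? Ty + " (complex divisor)" : Ty;
}

void caller(const std::string &Ty) {
  getPromotionType(Ty);       // was the default intended, or just forgotten?
  getPromotionType(Ty, true); // unreadable without checking the declaration
  getPromotionType(Ty, /*IsComplexDivisor=*/true); // better, but easy to skip
}
```

Dropping the default forces every caller to spell the argument out.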

https://github.com/llvm/llvm-project/pull/131451
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [llvm] Add option to emit `callgraph` section (PR #87574)

2025-03-17 Thread Paul Kirth via llvm-branch-commits


@@ -0,0 +1,93 @@
+# Test MIR printer and parser for type id field in callSites. It is used
+# for propagating call site type identifiers to emit in the call graph section.
+
+# RUN: llc --call-graph-section %s -run-pass=none -o - | FileCheck %s
+# CHECK: name: main
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: []
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], typeId:
+# CHECK-NEXT: 1234567890 }
+
+--- |  
+  define dso_local noundef i32 @_Z3addii(i32 noundef %a, i32 noundef %b) !type 
!0 !type !0 {
+  entry:
+%a.addr = alloca i32, align 4
+%b.addr = alloca i32, align 4
+store i32 %a, ptr %a.addr, align 4
+store i32 %b, ptr %b.addr, align 4
+%a_val = load i32, ptr %a.addr, align 4
+%b_val = load i32, ptr %b.addr, align 4
+%add = add nsw i32 %a_val, %b_val
+ret i32 %add
+  }
+  
+  define dso_local noundef i32 @_Z8multiplyii(i32 noundef %a, i32 noundef %b) 
!type !0 !type !0 {
+  entry:
+%a.addr = alloca i32, align 4
+%b.addr = alloca i32, align 4
+store i32 %a, ptr %a.addr, align 4
+store i32 %b, ptr %b.addr, align 4
+%a_val = load i32, ptr %a.addr, align 4
+%b_val = load i32, ptr %b.addr, align 4
+%mul = mul nsw i32 %a_val, %b_val
+ret i32 %mul
+  }
+  
+  define dso_local noundef ptr @_Z13get_operationb(i1 noundef zeroext 
%is_addition) !type !1 !type !1 {
+  entry:
+%is_addition.addr = alloca i8, align 1
+%storedv = zext i1 %is_addition to i8
+store i8 %storedv, ptr %is_addition.addr, align 1
+%is_addition_val = load i8, ptr %is_addition.addr, align 1
+%loadedv = trunc i8 %is_addition_val to i1
+br i1 %loadedv, label %cond.end, label %cond.false
+  
+  cond.false:   ; preds = %entry
+br label %cond.end
+  
+  cond.end: ; preds = %cond.false, 
%entry
+%cond = phi ptr [ @_Z8multiplyii, %cond.false ], [ @_Z3addii, %entry ]
+ret ptr %cond
+  }
+  
+  define dso_local noundef i32 @main(i32 noundef %argc) !type !2 !type !2 {
+  entry:
+%retval = alloca i32, align 4
+%argc.addr = alloca i32, align 4
+%x = alloca i32, align 4
+%y = alloca i32, align 4
+%op = alloca ptr, align 8
+store i32 0, ptr %retval, align 4
+store i32 %argc, ptr %argc.addr, align 4
+store i32 5, ptr %x, align 4
+store i32 10, ptr %y, align 4
+%argc_val = load i32, ptr %argc.addr, align 4
+%rem = srem i32 %argc_val, 2
+%cmp = icmp eq i32 %rem, 0
+%call = call noundef ptr @_Z13get_operationb(i1 noundef zeroext %cmp) [ 
"callee_type"(metadata !"_ZTSFPvbE.generalized") ]
+store ptr %call, ptr %op, align 8
+%op_val = load ptr, ptr %op, align 8
+%x_val = load i32, ptr %x, align 4
+%y_val = load i32, ptr %y, align 4
+%call1 = call noundef i32 %op_val(i32 noundef %x_val, i32 noundef %y_val) 
[ "callee_type"(metadata !"_ZTSFiiiE.generalized") ]
+ret i32 %call1
+  }
+  
+  !0 = !{i64 0, !"_ZTSFiiiE.generalized"}
+  !1 = !{i64 0, !"_ZTSFPvbE.generalized"}
+  !2 = !{i64 0, !"_ZTSFiiE.generalized"}
+
+...
+---
+name:main
+callSites:
+  - { bb: 0, offset: 0, fwdArgRegs: [] }
+  - { bb: 0, offset: 2, fwdArgRegs: [], typeId: 
+1234567890 }

ilovepi wrote:

nit: does this need to be on 2 lines? I know that's what the tools spit out, 
but it seems unnecessary. If it makes the checks harder or more confusing, then 
feel free to ignore.

https://github.com/llvm/llvm-project/pull/87574
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [llvm] Add option to emit `callgraph` section (PR #87574)

2025-03-17 Thread Paul Kirth via llvm-branch-commits


@@ -496,6 +496,9 @@ class LLVM_ABI MachineFunction {
   struct CallSiteInfo {
 /// Vector of call argument and its forwarding register.
 SmallVector ArgRegPairs;
+
+/// Callee type id.
+ConstantInt *TypeId = nullptr;

ilovepi wrote:

```suggestion
ConstantInt *CalleeTypeId = nullptr;
```
Seems less ambiguous.

https://github.com/llvm/llvm-project/pull/87574
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [llvm] Extract and propagate indirect call type id (PR #87575)

2025-03-17 Thread Paul Kirth via llvm-branch-commits

https://github.com/ilovepi commented:

LGTM on the RISC-V additions, but do follow through on @arsenm's comments 
before landing.

https://github.com/llvm/llvm-project/pull/87575
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [llvm] Add option to emit `callgraph` section (PR #87574)

2025-03-17 Thread Paul Kirth via llvm-branch-commits


@@ -576,6 +576,9 @@ void 
MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF,
   printRegMIR(ArgReg.Reg, YmlArgReg.Reg, TRI);
   YmlCS.ArgForwardingRegs.emplace_back(YmlArgReg);
 }
+// Get type id.
+if (CSInfo.second.TypeId)
+  YmlCS.TypeId = CSInfo.second.TypeId->getZExtValue();

ilovepi wrote:

This would be nicer w/ C++17 destructuring. That's probably a bit invasive for 
this patch since you'd have to change a bunch of unrelated code, but is 
something that could land as a separate NFC PR relatively quickly.
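
What the comment refers to, as a self-contained sketch with illustrative
names: C++17 structured bindings name the parts of the pair at the point of
use instead of going through `.first`/`.second`:

```cpp
#include <cstdint>
#include <cstdio>
#include <map>

struct CallSiteInfo {
  uint64_t TypeId = 0;
};

int main() {
  std::map<int, CallSiteInfo> CallSites{{0, {1234567890}}, {1, {0}}};
  for (const auto &[CallIdx, CSInfo] : CallSites) // no more CSInfo.second
    if (CSInfo.TypeId)
      std::printf("call %d -> typeId %llu\n", CallIdx,
                  static_cast<unsigned long long>(CSInfo.TypeId));
}
```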

https://github.com/llvm/llvm-project/pull/87574
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [Clang][CodeGen] Do not promote if complex divisor is real (PR #131451)

2025-03-17 Thread Eli Friedman via llvm-branch-commits

https://github.com/efriedma-quic approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/131451
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] 0b23d98 - Reduce memory usage in AST parent map generation by lazily checking if nodes have been seen (#129934)

2025-03-17 Thread Tom Stellard via llvm-branch-commits

Author: higher-performance
Date: 2025-03-17T14:46:32-07:00
New Revision: 0b23d98dceaa9f965bfa196a6adfa38b1b8bda8e

URL: 
https://github.com/llvm/llvm-project/commit/0b23d98dceaa9f965bfa196a6adfa38b1b8bda8e
DIFF: 
https://github.com/llvm/llvm-project/commit/0b23d98dceaa9f965bfa196a6adfa38b1b8bda8e.diff

LOG: Reduce memory usage in AST parent map generation by lazily checking if 
nodes have been seen (#129934)

This mitigates a regression introduced in #87824.

The mitigation here is to store pointers to the deduplicated AST nodes, rather 
than copies of the nodes themselves. This allows a pointer-optimized set to be 
used and saves a lot of memory because `clang::DynTypedNode` is ~5 times larger 
than a pointer.

Fixes #129808.

(cherry picked from commit 8c7f0eaa6ee3f84e3d8260535cced234bed4fa28)
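
The underlying pattern, as a self-contained sketch with generic names rather
than the Clang classes: deduplicate large values through a set keyed by a
stable identity pointer, so the set stores one pointer per entry instead of a
full copy of the node:

```cpp
#include <cstddef>
#include <cstdio>
#include <unordered_set>
#include <vector>

struct Node {
  const void *Identity; // stable key, like DynTypedNode's memoization data
  char Payload[40];     // stand-in for a value several pointers wide
};

class ParentList {
  std::vector<Node> Items;
  std::unordered_set<const void *> Dedup; // one pointer per entry
public:
  void push_back(const Node &V) {
    // Keep keyless nodes unconditionally; otherwise keep the first sighting.
    if (!V.Identity || Dedup.insert(V.Identity).second)
      Items.push_back(V);
  }
  std::size_t size() const { return Items.size(); }
};

int main() {
  int Key;
  ParentList P;
  P.push_back({&Key, {}});
  P.push_back({&Key, {}});        // duplicate identity: skipped
  std::printf("%zu\n", P.size()); // prints 1
}
```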

Added: 


Modified: 
clang/lib/AST/ParentMapContext.cpp

Removed: 




diff  --git a/clang/lib/AST/ParentMapContext.cpp 
b/clang/lib/AST/ParentMapContext.cpp
index 7ff492443031d..d8dd352c42d6b 100644
--- a/clang/lib/AST/ParentMapContext.cpp
+++ b/clang/lib/AST/ParentMapContext.cpp
@@ -12,10 +12,11 @@
 
//===--===//
 
 #include "clang/AST/ParentMapContext.h"
-#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/AST/TemplateBase.h"
+#include "llvm/ADT/SmallPtrSet.h"
 
 using namespace clang;
 
@@ -69,17 +70,21 @@ class ParentMapContext::ParentMap {
   for (; N > 0; --N)
 push_back(Value);
 }
-bool contains(const DynTypedNode &Value) {
-  return Seen.contains(Value);
+bool contains(const DynTypedNode &Value) const {
+  const void *Identity = Value.getMemoizationData();
+  assert(Identity);
+  return Dedup.contains(Identity);
 }
 void push_back(const DynTypedNode &Value) {
-  if (!Value.getMemoizationData() || Seen.insert(Value).second)
+  const void *Identity = Value.getMemoizationData();
+  if (!Identity || Dedup.insert(Identity).second) {
 Items.push_back(Value);
+  }
 }
 llvm::ArrayRef view() const { return Items; }
   private:
-llvm::SmallVector Items;
-llvm::SmallDenseSet Seen;
+llvm::SmallVector Items;
+llvm::SmallPtrSet Dedup;
   };
 
   /// Maps from a node to its parents. This is used for nodes that have



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] release/20.x: Reduce memory usage in AST parent map generation by lazily checking if nodes have been seen (#129934) (PR #131209)

2025-03-17 Thread via llvm-branch-commits

github-actions[bot] wrote:

@higher-performance (or anyone else): if you would like to add a note about
this fix in the release notes (completely optional), please reply to this
comment with a one- or two-sentence description of the fix. When you are done,
please add the release:note label to this PR.

https://github.com/llvm/llvm-project/pull/131209
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] Backport/20.x: [Clang] Fix an incorrect assumption on getTemplatedDecl() (PR #131729)

2025-03-17 Thread via llvm-branch-commits

https://github.com/cor3ntin approved this pull request.


https://github.com/llvm/llvm-project/pull/131729
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [llvm] [ctxprof] Track unhandled call targets (PR #131417)

2025-03-17 Thread Snehasish Kumar via llvm-branch-commits


@@ -83,6 +84,20 @@ struct ContextRoot {
   // Count the number of entries - regardless if we could take the `Taken` 
mutex
   ::__sanitizer::atomic_uint64_t TotalEntries = {};
 
+  // Profiles for functions we encounter when collecting a contextual profile,
+  // that are not associated with a callsite. This is expected to happen for
+  // signal handlers, but it also - problematically - currently happens for
+  // mem{memset|copy|move|set}, which are currently inserted after profile

snehasish wrote:

Maybe just generalize this statement to "it also happens for functions
generated after profile instrumentation, e.g. mem{set|copy|move}".

https://github.com/llvm/llvm-project/pull/131417
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [llvm] [ctxprof] Track unhandled call targets (PR #131417)

2025-03-17 Thread Snehasish Kumar via llvm-branch-commits


@@ -246,22 +246,37 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
 
 ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
  uint32_t NumCounters) {
-  // 1) if we are under a root (regardless if this thread is collecting or not 
a
+
+  // 1) if we are currently collecting a contextual profile, fetch a 
ContextNode
+  // in the `Unhandled` set. We want to do this regardless of 
`ProfilingStarted`
+  // to (hopefully) offset the penalty of creating these contexts to before
+  // profiling.
+  //
+  // 2) if we are under a root (regardless if this thread is collecting or not 
a
   // contextual profile for that root), do not collect a flat profile. We want
   // to keep flat profiles only for activations that can't happen under a root,
   // to avoid confusing profiles. We can, for example, combine flattened and
   // flat profiles meaningfully, as we wouldn't double-count anything.
   //
-  // 2) to avoid lengthy startup, don't bother with flat profiles until the
+  // 3) to avoid lengthy startup, don't bother with flat profiles until the

snehasish wrote:

"the profiling has started"? 

https://github.com/llvm/llvm-project/pull/131417
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [GlobalISel] Combine redundant sext_inreg (PR #131624)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -372,3 +372,30 @@ bool CombinerHelper::matchCastOfInteger(const MachineInstr 
&CastMI,
 return false;
   }
 }
+
+void CombinerHelper::applyRedundantSextInReg(MachineInstr &Root,
+ MachineInstr &Other) const {
+  assert(Root.getOpcode() == TargetOpcode::G_SEXT_INREG &&
+ Other.getOpcode() == TargetOpcode::G_SEXT_INREG);
+
+  unsigned RootWidth = Root.getOperand(2).getImm();
+  unsigned OtherWidth = Other.getOperand(2).getImm();
+
+  Register Dst = Root.getOperand(0).getReg();
+  Register OtherDst = Other.getOperand(0).getReg();
+  Register Src = Other.getOperand(1).getReg();
+
+  if (RootWidth >= OtherWidth) {
+// The root sext_inreg is entirely redundant because the other one
+// is narrower.
+Observer.changingAllUsesOfReg(MRI, Dst);
+MRI.replaceRegWith(Dst, OtherDst);

arsenm wrote:

Missing canReplaceReg check? This could break after regbank 
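
For context on what the combine exploits, a reference model of `G_SEXT_INREG`
in plain C++, assuming 32-bit registers:

```cpp
#include <cassert>
#include <cstdint>

// Sign-extend the low W bits of V into all 32 bits (0 < W <= 32).
int32_t sextInReg(int32_t V, unsigned W) {
  uint32_t U = static_cast<uint32_t>(V) << (32 - W); // shift unsigned: no UB
  return static_cast<int32_t>(U) >> (32 - W);        // arithmetic shift back
}

int main() {
  for (int32_t V : {0x7f, -0x80, 0x1234, -1}) {
    // Root width 16 >= other width 8: bits 8..31 of the inner result already
    // all equal bit 7, so re-extending from bit 15 changes nothing and the
    // root G_SEXT_INREG can simply reuse the narrower one's result.
    assert(sextInReg(sextInReg(V, 8), 16) == sextInReg(V, 8));
  }
}
```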

https://github.com/llvm/llvm-project/pull/131624
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add sext_trunc in RegBankCombiner (PR #131623)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm commented:

Missing test changes? 

https://github.com/llvm/llvm-project/pull/131623
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [llvm] [ctxprof] Track unhandled call targets (PR #131417)

2025-03-17 Thread Mircea Trofin via llvm-branch-commits

https://github.com/mtrofin updated 
https://github.com/llvm/llvm-project/pull/131417

>From 82e70eef49a048788bc5413e44461f2b7f96cf91 Mon Sep 17 00:00:00 2001
From: Mircea Trofin 
Date: Fri, 14 Mar 2025 15:59:22 -0700
Subject: [PATCH] [ctxprof] Track unhandled call targets

---
 .../lib/ctx_profile/CtxInstrContextNode.h |  1 +
 .../lib/ctx_profile/CtxInstrProfiling.cpp | 36 +++---
 .../lib/ctx_profile/CtxInstrProfiling.h   | 15 +
 .../tests/CtxInstrProfilingTest.cpp   |  2 +-
 .../TestCases/generate-context.cpp| 15 -
 .../llvm/ProfileData/CtxInstrContextNode.h|  1 +
 .../llvm/ProfileData/PGOCtxProfReader.h   | 26 +---
 .../llvm/ProfileData/PGOCtxProfWriter.h   |  6 +-
 llvm/lib/ProfileData/PGOCtxProfReader.cpp | 65 ++-
 llvm/lib/ProfileData/PGOCtxProfWriter.cpp | 27 ++--
 .../Instrumentation/PGOInstrumentation.cpp| 13 
 .../Inputs/valid-with-unhandled.yaml  | 26 
 .../llvm-ctxprof-util/llvm-ctxprof-util.test  | 21 --
 .../PGOCtxProfReaderWriterTest.cpp| 14 ++--
 14 files changed, 212 insertions(+), 56 deletions(-)
 create mode 100644 
llvm/test/tools/llvm-ctxprof-util/Inputs/valid-with-unhandled.yaml

diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h 
b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
index 55962df57fb58..a176662b5cb3d 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -121,6 +121,7 @@ class ProfileWriter {
 public:
   virtual void startContextSection() = 0;
   virtual void writeContextual(const ctx_profile::ContextNode &RootNode,
+   const ctx_profile::ContextNode *Unhandled,
uint64_t TotalRootEntryCount) = 0;
   virtual void endContextSection() = 0;
 
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp 
b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 6ef7076d93e31..c97229dd7f043 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -246,22 +246,37 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
 
 ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
  uint32_t NumCounters) {
-  // 1) if we are under a root (regardless if this thread is collecting or not a
+
+  // 1) if we are currently collecting a contextual profile, fetch a ContextNode
+  // in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
+  // to (hopefully) offset the penalty of creating these contexts to before
+  // profiling.
+  //
+  // 2) if we are under a root (regardless if this thread is collecting or not a
   // contextual profile for that root), do not collect a flat profile. We want
   // to keep flat profiles only for activations that can't happen under a root,
   // to avoid confusing profiles. We can, for example, combine flattened and
   // flat profiles meaningfully, as we wouldn't double-count anything.
   //
-  // 2) to avoid lengthy startup, don't bother with flat profiles until the
-  // profiling started. We would reset them anyway when profiling starts.
+  // 3) to avoid lengthy startup, don't bother with flat profiles until the
+  // profiling has started. We would reset them anyway when profiling starts.
   // HOWEVER. This does lose profiling for message pumps: those functions are
   // entered once and never exit. They should be assumed to be entered before
   // profiling starts - because profiling should start after the server is up
   // and running (which is equivalent to "message pumps are set up").
-  if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
-return TheScratchContext;
-  return markAsScratch(
-  onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+  ContextRoot *R = __llvm_ctx_profile_current_context_root;
+  if (!R) {
+if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
+  return TheScratchContext;
+else
+  return markAsScratch(
+  onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+  }
+  auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr});
+  if (Ins)
+Iter->second =
+getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
+  return markAsScratch(onContextEnter(*Iter->second));
 }
 
 ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
@@ -396,6 +411,8 @@ void __llvm_ctx_profile_start_collection() {
   ++NumMemUnits;
 
 resetContextNode(*Root->FirstNode);
+if (Root->FirstUnhandledCalleeNode)
+  resetContextNode(*Root->FirstUnhandledCalleeNode);
 __sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
   }
   __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
@@ -416,8 +433,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
   __sanitizer::Printf("[ctxprof]

[llvm-branch-commits] [clang] Backport/20.x: [Clang] Fix an incorrect assumption on getTemplatedDecl() (PR #131729)

2025-03-17 Thread Younan Zhang via llvm-branch-commits

https://github.com/zyn0217 milestoned 
https://github.com/llvm/llvm-project/pull/131729
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [llvm] [ctxprof] Track unhandled call targets (PR #131417)

2025-03-17 Thread Snehasish Kumar via llvm-branch-commits


@@ -265,7 +275,16 @@ Error llvm::createCtxProfFromYAML(StringRef Profile, raw_ostream &Out) {
   if (!TopList)
 return createStringError(
 "Unexpected error converting internal structure to ctx profile");
-  Writer.writeContextual(*TopList, DC.TotalRootEntryCount);
+
+  ctx_profile::ContextNode *FirstUnhandled = nullptr;
+  for (const auto &U : DC.Unhandled) {
+SerializableCtxRepresentation Unhandled;
+Unhandled.Guid = U.first;
+Unhandled.Counters.insert(Unhandled.Counters.begin(), U.second.begin(),

snehasish wrote:

Repeatedly inserting at the front is inefficient. How about reserve(size), 
append, and then reverse the vector?

https://github.com/llvm/llvm-project/pull/131417
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NPM] Port LiveDebugValues to NPM (PR #131563)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -77,36 +78,68 @@ class LiveDebugValues : public MachineFunctionPass {
 AU.setPreservesCFG();
 MachineFunctionPass::getAnalysisUsage(AU);
   }
+};
+
+struct LiveDebugValues {
+  LiveDebugValues();
+  ~LiveDebugValues() = default;
+  bool run(MachineFunction &MF, bool ShouldEmitDebugEntryValues);
 
 private:
   std::unique_ptr InstrRefImpl;
   std::unique_ptr VarLocImpl;
-  TargetPassConfig *TPC = nullptr;
   MachineDominatorTree MDT;
 };
 } // namespace
 
-char LiveDebugValues::ID = 0;
+char LiveDebugValuesLegacy::ID = 0;
 
-char &llvm::LiveDebugValuesID = LiveDebugValues::ID;
+char &llvm::LiveDebugValuesID = LiveDebugValuesLegacy::ID;
 
-INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", 
false,
-false)
+INITIALIZE_PASS(LiveDebugValuesLegacy, DEBUG_TYPE, "Live DEBUG_VALUE analysis",
+false, false)
 
 /// Default construct and initialize the pass.
-LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
-  initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry());
+LiveDebugValuesLegacy::LiveDebugValuesLegacy() : MachineFunctionPass(ID) {
+  initializeLiveDebugValuesLegacyPass(*PassRegistry::getPassRegistry());
+}
+
+LiveDebugValues::LiveDebugValues() {
   InstrRefImpl =
   std::unique_ptr(llvm::makeInstrRefBasedLiveDebugValues());
   VarLocImpl = 
std::unique_ptr(llvm::makeVarLocBasedLiveDebugValues());
 }
 
-bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+LiveDebugValuesPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+  if (!LiveDebugValues().run(MF, ShouldEmitDebugEntryValues))
+return PreservedAnalyses::all();
+  auto PA = getMachineFunctionPassPreservedAnalyses();

arsenm wrote:

OK, should fix both PMs in a follow up 

https://github.com/llvm/llvm-project/pull/131563
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [llvm] [ctxprof] Track unhandled call targets (PR #131417)

2025-03-17 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-pgo

Author: Mircea Trofin (mtrofin)


Changes

Collect profiles for functions that we encounter while collecting a contextual 
profile but that are not associated with a callsite. This is expected to happen 
for signal handlers, but it also - problematically - currently happens for 
mem{set|copy|move}, which are inserted after profile instrumentation.

Collecting a "regular" flat profile in these cases would hide the problem - 
that we lose better profiling opportunities.

---

Patch is 31.40 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/131417.diff


14 Files Affected:

- (modified) compiler-rt/lib/ctx_profile/CtxInstrContextNode.h (+1) 
- (modified) compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp (+26-8) 
- (modified) compiler-rt/lib/ctx_profile/CtxInstrProfiling.h (+15) 
- (modified) compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp (+1-1) 
- (modified) compiler-rt/test/ctx_profile/TestCases/generate-context.cpp 
(+14-1) 
- (modified) llvm/include/llvm/ProfileData/CtxInstrContextNode.h (+1) 
- (modified) llvm/include/llvm/ProfileData/PGOCtxProfReader.h (+17-9) 
- (modified) llvm/include/llvm/ProfileData/PGOCtxProfWriter.h (+4-2) 
- (modified) llvm/lib/ProfileData/PGOCtxProfReader.cpp (+48-17) 
- (modified) llvm/lib/ProfileData/PGOCtxProfWriter.cpp (+23-4) 
- (modified) llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp (+13) 
- (added) llvm/test/tools/llvm-ctxprof-util/Inputs/valid-with-unhandled.yaml 
(+26) 
- (modified) llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test (+15-6) 
- (modified) llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp (+7-7) 


``diff
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h 
b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
index 55962df57fb58..a176662b5cb3d 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -121,6 +121,7 @@ class ProfileWriter {
 public:
   virtual void startContextSection() = 0;
   virtual void writeContextual(const ctx_profile::ContextNode &RootNode,
+   const ctx_profile::ContextNode *Unhandled,
uint64_t TotalRootEntryCount) = 0;
   virtual void endContextSection() = 0;
 
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp 
b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 6ef7076d93e31..26f22926a5704 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -246,22 +246,37 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
 
 ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
  uint32_t NumCounters) {
-  // 1) if we are under a root (regardless if this thread is collecting or not a
+
+  // 1) if we are currently collecting a contextual profile, fetch a ContextNode
+  // in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
+  // to (hopefully) offset the penalty of creating these contexts to before
+  // profiling.
+  //
+  // 2) if we are under a root (regardless if this thread is collecting or not a
   // contextual profile for that root), do not collect a flat profile. We want
   // to keep flat profiles only for activations that can't happen under a root,
   // to avoid confusing profiles. We can, for example, combine flattened and
   // flat profiles meaningfully, as we wouldn't double-count anything.
   //
-  // 2) to avoid lengthy startup, don't bother with flat profiles until the
+  // 3) to avoid lengthy startup, don't bother with flat profiles until the
   // profiling started. We would reset them anyway when profiling starts.
   // HOWEVER. This does lose profiling for message pumps: those functions are
   // entered once and never exit. They should be assumed to be entered before
   // profiling starts - because profiling should start after the server is up
   // and running (which is equivalent to "message pumps are set up").
-  if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
-return TheScratchContext;
-  return markAsScratch(
-  onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+  ContextRoot *R = __llvm_ctx_profile_current_context_root;
+  if (!R) {
+if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
+  return TheScratchContext;
+else
+  return markAsScratch(
+  onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+  }
+  auto It = R->Unhandled.insert({Guid, nullptr});
+  if (It.second)
+It.first->second =
+getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
+  return markAsScratch(onContextEnter(*It.first->second));
 }
 
 ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
@@ -396,6 +411,8 @@ void __llvm_ctx_profile_start_collection() {
 

[llvm-branch-commits] [compiler-rt] [llvm] [ctxprof] Track unhandled call targets (PR #131417)

2025-03-17 Thread Mircea Trofin via llvm-branch-commits

https://github.com/mtrofin ready_for_review 
https://github.com/llvm/llvm-project/pull/131417
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [llvm] [ctxprof] Track unhandled call targets (PR #131417)

2025-03-17 Thread Mircea Trofin via llvm-branch-commits

https://github.com/mtrofin edited 
https://github.com/llvm/llvm-project/pull/131417
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt] [llvm] [ctxprof] Track unhandled call targets (PR #131417)

2025-03-17 Thread Snehasish Kumar via llvm-branch-commits


@@ -246,22 +246,37 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
 
 ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
  uint32_t NumCounters) {
-  // 1) if we are under a root (regardless if this thread is collecting or not a
+
+  // 1) if we are currently collecting a contextual profile, fetch a ContextNode
+  // in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
+  // to (hopefully) offset the penalty of creating these contexts to before
+  // profiling.
+  //
+  // 2) if we are under a root (regardless if this thread is collecting or not a
   // contextual profile for that root), do not collect a flat profile. We want
   // to keep flat profiles only for activations that can't happen under a root,
   // to avoid confusing profiles. We can, for example, combine flattened and
   // flat profiles meaningfully, as we wouldn't double-count anything.
   //
-  // 2) to avoid lengthy startup, don't bother with flat profiles until the
+  // 3) to avoid lengthy startup, don't bother with flat profiles until the
   // profiling started. We would reset them anyway when profiling starts.
   // HOWEVER. This does lose profiling for message pumps: those functions are
   // entered once and never exit. They should be assumed to be entered before
   // profiling starts - because profiling should start after the server is up
   // and running (which is equivalent to "message pumps are set up").
-  if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
-return TheScratchContext;
-  return markAsScratch(
-  onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+  ContextRoot *R = __llvm_ctx_profile_current_context_root;
+  if (!R) {
+if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
+  return TheScratchContext;
+else
+  return markAsScratch(
+  onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+  }
+  auto It = R->Unhandled.insert({Guid, nullptr});

snehasish wrote:

`auto [Iter, Inserted] = R->Unhandled.insert(...)`
so that the `.first`/`.second` usages below become a bit more readable.
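
In other words, something like the following sketch (matching the shape the
patch was later updated to elsewhere in this thread):

```cpp
// Sketch: structured bindings give the iterator and the "was inserted" flag
// readable names instead of the opaque It.first / It.second accesses.
auto [Iter, Inserted] = R->Unhandled.insert({Guid, nullptr});
if (Inserted)
  Iter->second =
      getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
return markAsScratch(onContextEnter(*Iter->second));
```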

https://github.com/llvm/llvm-project/pull/131417
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] Backport/20.x: [Clang] Fix an incorrect assumption on getTemplatedDecl() (PR #131729)

2025-03-17 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Younan Zhang (zyn0217)


Changes

This backports d9110858ee because it fixes a regression introduced in 19, and 
we don't want it to persist in 20.

---
Full diff: https://github.com/llvm/llvm-project/pull/131729.diff


3 Files Affected:

- (modified) clang/docs/ReleaseNotes.rst (+1) 
- (modified) clang/lib/Sema/SemaAccess.cpp (+2-2) 
- (modified) clang/test/SemaCXX/concept-crash-on-diagnostic.cpp (+12) 


``diff
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 02292c10e6964..dc63b5213c546 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1059,6 +1059,7 @@ Bug Fixes to C++ Support
   corresponding to a pack parameter (#GH124715)
 - Clang is now better at keeping track of friend function template instance 
contexts. (#GH55509)
 - Fixed an integer overflow bug in computing template parameter depths when 
synthesizing CTAD guides. (#GH128691)
+- Fixed an incorrect pointer access when checking access-control on concepts. 
(#GH131530)
 
 Bug Fixes to AST Handling
 ^
diff --git a/clang/lib/Sema/SemaAccess.cpp b/clang/lib/Sema/SemaAccess.cpp
index f79d9a758e7af..6813786df3fc4 100644
--- a/clang/lib/Sema/SemaAccess.cpp
+++ b/clang/lib/Sema/SemaAccess.cpp
@@ -1518,8 +1518,8 @@ void Sema::HandleDelayedAccessCheck(DelayedDiagnostic 
&DD, Decl *D) {
   } else if (FunctionDecl *FN = dyn_cast(D)) {
 DC = FN;
   } else if (TemplateDecl *TD = dyn_cast(D)) {
-if (isa(TD->getTemplatedDecl()))
-  DC = cast(TD->getTemplatedDecl());
+if (auto *D = dyn_cast_if_present(TD->getTemplatedDecl()))
+  DC = D;
   } else if (auto *RD = dyn_cast(D)) {
 DC = RD;
   }
diff --git a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp 
b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
index 71e55c8290ee4..c38f075de 100644
--- a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
+++ b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
@@ -36,3 +36,15 @@ void function() {
 // expected-note@#4 {{candidate template ignored: constraints not satisfied 
[with IteratorL = Object *, IteratorR = Object *]}}
 // We don't know exactly the substituted type for `lhs == rhs`, thus a 
placeholder 'expr-type' is emitted.
 // expected-note@#3 {{because 'convertible_to' would be 
invalid}}
+
+namespace GH131530 {
+
+class foo {
+  struct bar {}; // expected-note {{implicitly declared private}}
+};
+
+template 
+concept is_foo_concept = __is_same(foo::bar, T);
+// expected-error@-1 {{'bar' is a private member of 'GH131530::foo'}}
+
+}

``




https://github.com/llvm/llvm-project/pull/131729
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] release/20.x: Reduce memory usage in AST parent map generation by lazily checking if nodes have been seen (#129934) (PR #131209)

2025-03-17 Thread Tom Stellard via llvm-branch-commits

https://github.com/tstellar closed 
https://github.com/llvm/llvm-project/pull/131209
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [OpenMP][MLIR] Refactor code related to collecting privatizer info into a shared util (PR #131582)

2025-03-17 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-mlir-llvm

Author: Kareem Ergawy (ergawy)


Changes

Moves the code needed to collect info about delayed privatizers into a shared 
util instead of repeating the same pattern across all relevant constructs.
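
A rough usage sketch (the `op`, the construct it comes from, and the error
handling below are illustrative, not lifted from the patch):

```cpp
// Sketch: each converter builds a single PrivateVarsInfo instead of four
// parallel vectors, and threads it through the privatization helpers.
PrivateVarsInfo privateVarsInfo(op);

if (llvm::Error err =
        initPrivateVars(builder, moduleTranslation, privateVarsInfo)) {
  op.emitError(llvm::toString(std::move(err)));
  return mlir::failure();
}
```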

---

Patch is 22.93 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/131582.diff


1 Files Affected:

- (modified) 
mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+104-147) 


``diff
diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 17d0a7007729f..315c6b8ccc553 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -696,20 +696,42 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase 
&builder,
   return success();
 }
 
-/// Populates `privatizations` with privatization declarations used for the
-/// given op.
-template 
-static void collectPrivatizationDecls(
-OP op, SmallVectorImpl &privatizations) {
-  std::optional attr = op.getPrivateSyms();
-  if (!attr)
-return;
+/// A util to collect info needed to convert delayed privatizers from MLIR to
+/// LLVM.
+struct PrivateVarsInfo {
+  template 
+  PrivateVarsInfo(OP op)
+  : privateBlockArgs(
+cast(*op).getPrivateBlockArgs()) {
+mlirPrivateVars.reserve(privateBlockArgs.size());
+llvmPrivateVars.reserve(privateBlockArgs.size());
+collectPrivatizationDecls(op, privateDecls);
 
-  privatizations.reserve(privatizations.size() + attr->size());
-  for (auto symbolRef : attr->getAsRange()) {
-privatizations.push_back(findPrivatizer(op, symbolRef));
+for (mlir::Value privateVar : op.getPrivateVars())
+  mlirPrivateVars.push_back(privateVar);
   }
-}
+
+  MutableArrayRef privateBlockArgs;
+  SmallVector mlirPrivateVars;
+  SmallVector llvmPrivateVars;
+  SmallVector privateDecls;
+
+private:
+  /// Populates `privatizations` with privatization declarations used for the
+  /// given op.
+  template 
+  static void collectPrivatizationDecls(
+  OP op, SmallVectorImpl &privatizations) {
+std::optional attr = op.getPrivateSyms();
+if (!attr)
+  return;
+
+privatizations.reserve(privatizations.size() + attr->size());
+for (auto symbolRef : attr->getAsRange()) {
+  privatizations.push_back(findPrivatizer(op, symbolRef));
+}
+  }
+};
 
 /// Populates `reductions` with reduction declarations used in the given op.
 template 
@@ -1384,19 +1406,18 @@ static llvm::Expected initPrivateVar(
 static llvm::Error
 initPrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-MutableArrayRef privateBlockArgs,
-MutableArrayRef privateDecls,
-MutableArrayRef mlirPrivateVars,
-llvm::SmallVectorImpl &llvmPrivateVars,
+PrivateVarsInfo &privateVarsInfo,
 llvm::DenseMap *mappedPrivateVars = nullptr) {
-  if (privateBlockArgs.empty())
+  if (privateVarsInfo.privateBlockArgs.empty())
 return llvm::Error::success();
 
   llvm::BasicBlock *privInitBlock = splitBB(builder, true, "omp.private.init");
   setInsertPointForPossiblyEmptyBlock(builder, privInitBlock);
 
   for (auto [idx, zip] : llvm::enumerate(llvm::zip_equal(
-   privateDecls, mlirPrivateVars, privateBlockArgs, llvmPrivateVars))) 
{
+   privateVarsInfo.privateDecls, privateVarsInfo.mlirPrivateVars,
+   privateVarsInfo.privateBlockArgs,
+   privateVarsInfo.llvmPrivateVars))) {
 auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVar] = zip;
 llvm::Expected privVarOrErr = initPrivateVar(
 builder, moduleTranslation, privDecl, mlirPrivVar, blockArg,
@@ -1420,10 +1441,7 @@ initPrivateVars(llvm::IRBuilderBase &builder,
 static llvm::Expected
 allocatePrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-MutableArrayRef privateBlockArgs,
-MutableArrayRef privateDecls,
-MutableArrayRef mlirPrivateVars,
-llvm::SmallVectorImpl &llvmPrivateVars,
+PrivateVarsInfo &privateVarsInfo,
 const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
 llvm::DenseMap *mappedPrivateVars = nullptr) 
{
   // Allocate private vars
@@ -1449,8 +1467,9 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
->getDataLayout()
.getProgramAddressSpace();
 
-  for (auto [privDecl, mlirPrivVar, blockArg] :
-   llvm::zip_equal(privateDecls, mlirPrivateVars, privateBlockArgs)) {
+  for (auto [privDecl, mlirPrivVar, blockArg] : llvm::zip_equal(
+   privateVarsInfo.privateDecls, privateVarsInfo.mlirPrivateVars,
+  

[llvm-branch-commits] [llvm] AMDGPU: Replace unused update.dpp inputs with poison instead of undef (PR #131287)

2025-03-17 Thread Nuno Lopes via llvm-branch-commits

https://github.com/nunoplopes approved this pull request.


https://github.com/llvm/llvm-project/pull/131287
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [OpenMP][MLIR] Refactor code related to collecting privatizer info into a shared util (PR #131582)

2025-03-17 Thread Kareem Ergawy via llvm-branch-commits

https://github.com/ergawy created 
https://github.com/llvm/llvm-project/pull/131582

Moves the code needed to collect info about delayed privatizers into a shared 
util instead of repeating the same pattern across all relevant constructs.

>From e237f8b5c88d9a3106398cf0eac2a5fba4cdb6e4 Mon Sep 17 00:00:00 2001
From: ergawy 
Date: Mon, 17 Mar 2025 03:37:00 -0500
Subject: [PATCH] [OpenMP][MLIR] Refactor code related to collecting privatizer
 info into a shared util

Moves the code needed to collect info about delayed privatizers into a
shared util instead of repeating the same pattern across all relevant
constructs.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 251 --
 1 file changed, 104 insertions(+), 147 deletions(-)

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 17d0a7007729f..315c6b8ccc553 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -696,20 +696,42 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase 
&builder,
   return success();
 }
 
-/// Populates `privatizations` with privatization declarations used for the
-/// given op.
-template 
-static void collectPrivatizationDecls(
-OP op, SmallVectorImpl &privatizations) {
-  std::optional attr = op.getPrivateSyms();
-  if (!attr)
-return;
+/// A util to collect info needed to convert delayed privatizers from MLIR to
+/// LLVM.
+struct PrivateVarsInfo {
+  template 
+  PrivateVarsInfo(OP op)
+  : privateBlockArgs(
+cast(*op).getPrivateBlockArgs()) {
+mlirPrivateVars.reserve(privateBlockArgs.size());
+llvmPrivateVars.reserve(privateBlockArgs.size());
+collectPrivatizationDecls(op, privateDecls);
 
-  privatizations.reserve(privatizations.size() + attr->size());
-  for (auto symbolRef : attr->getAsRange()) {
-privatizations.push_back(findPrivatizer(op, symbolRef));
+for (mlir::Value privateVar : op.getPrivateVars())
+  mlirPrivateVars.push_back(privateVar);
   }
-}
+
+  MutableArrayRef privateBlockArgs;
+  SmallVector mlirPrivateVars;
+  SmallVector llvmPrivateVars;
+  SmallVector privateDecls;
+
+private:
+  /// Populates `privatizations` with privatization declarations used for the
+  /// given op.
+  template 
+  static void collectPrivatizationDecls(
+  OP op, SmallVectorImpl &privatizations) {
+std::optional attr = op.getPrivateSyms();
+if (!attr)
+  return;
+
+privatizations.reserve(privatizations.size() + attr->size());
+for (auto symbolRef : attr->getAsRange()) {
+  privatizations.push_back(findPrivatizer(op, symbolRef));
+}
+  }
+};
 
 /// Populates `reductions` with reduction declarations used in the given op.
 template 
@@ -1384,19 +1406,18 @@ static llvm::Expected initPrivateVar(
 static llvm::Error
 initPrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-MutableArrayRef privateBlockArgs,
-MutableArrayRef privateDecls,
-MutableArrayRef mlirPrivateVars,
-llvm::SmallVectorImpl &llvmPrivateVars,
+PrivateVarsInfo &privateVarsInfo,
 llvm::DenseMap *mappedPrivateVars = nullptr) {
-  if (privateBlockArgs.empty())
+  if (privateVarsInfo.privateBlockArgs.empty())
 return llvm::Error::success();
 
   llvm::BasicBlock *privInitBlock = splitBB(builder, true, "omp.private.init");
   setInsertPointForPossiblyEmptyBlock(builder, privInitBlock);
 
   for (auto [idx, zip] : llvm::enumerate(llvm::zip_equal(
-   privateDecls, mlirPrivateVars, privateBlockArgs, llvmPrivateVars))) 
{
+   privateVarsInfo.privateDecls, privateVarsInfo.mlirPrivateVars,
+   privateVarsInfo.privateBlockArgs,
+   privateVarsInfo.llvmPrivateVars))) {
 auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVar] = zip;
 llvm::Expected privVarOrErr = initPrivateVar(
 builder, moduleTranslation, privDecl, mlirPrivVar, blockArg,
@@ -1420,10 +1441,7 @@ initPrivateVars(llvm::IRBuilderBase &builder,
 static llvm::Expected
 allocatePrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-MutableArrayRef privateBlockArgs,
-MutableArrayRef privateDecls,
-MutableArrayRef mlirPrivateVars,
-llvm::SmallVectorImpl &llvmPrivateVars,
+PrivateVarsInfo &privateVarsInfo,
 const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
 llvm::DenseMap *mappedPrivateVars = nullptr) 
{
   // Allocate private vars
@@ -1449,8 +1467,9 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
->getDataLayout()
.getProgramAddressSpace();
 
-  for (auto [privDecl, mlirP

[llvm-branch-commits] [llvm] [AMDGPU][Legalizer] Widen i16 G_SEXT_INREG (PR #131308)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131308

>From cdfba0ea7ab0fcb60d632a25433b18b421022c25 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Wed, 5 Mar 2025 13:41:04 +0100
Subject: [PATCH 1/2] [AMDGPU][Legalizer] Widen i16 G_SEXT_INREG

It's better to widen them to avoid them being lowered into a G_ASHR + G_SHL 
pair. With this change we just extend to i32, then truncate the result.
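
For intuition, the two lowerings compute the same value. A hedged C++
rendering of sign-extending the low 8 bits of an i16 (illustration only, not
compiler code):

```cpp
#include <cstdint>

// Old lowering: G_SHL + G_ASHR at 16 bits.
int16_t viaShlAshr(int16_t X) {
  int16_t Shl = static_cast<int16_t>(static_cast<uint16_t>(X) << 8); // G_SHL
  return static_cast<int16_t>(Shl >> 8);                             // G_ASHR
}

// New lowering: extend to 32 bits, do the sext_inreg there, truncate back.
int16_t viaWidening(int16_t X) {
  uint32_t W = static_cast<uint32_t>(static_cast<int32_t>(X)); // extend
  int32_t S = static_cast<int32_t>(W << 24) >> 24;             // G_SEXT_INREG 8
  return static_cast<int16_t>(S);                              // G_TRUNC
}
```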
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |   7 +-
 .../AMDGPU/GlobalISel/legalize-abs.mir|   8 +-
 .../AMDGPU/GlobalISel/legalize-ashr.mir   |  20 +--
 .../AMDGPU/GlobalISel/legalize-sext-inreg.mir | 155 +++---
 .../AMDGPU/GlobalISel/legalize-sext.mir   | 101 ++--
 .../AMDGPU/GlobalISel/legalize-smax.mir   |  33 +++-
 .../AMDGPU/GlobalISel/legalize-smin.mir   |  33 +++-
 .../AMDGPU/GlobalISel/legalize-smulh.mir  | 132 +++
 .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll |  45 ++---
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   | 130 ++-
 11 files changed, 299 insertions(+), 368 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b3a8183beeacf..6e611ebb4b625 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2009,7 +2009,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const 
GCNSubtarget &ST_,
   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
   // RegBankSelect.
   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
-.legalFor({{S32}, {S64}});
+.legalFor({{S32}, {S64}})
+.widenScalarIf(typeIs(0, S16), widenScalarOrEltToNextPow2(0, 32));
 
   if (ST.hasVOP3PInsts()) {
 SextInReg.lowerFor({{V2S16}})
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 493e8cef63890..f81d7f1c300b8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -17,8 +17,7 @@ define i8 @v_ashr_i8(i8 %value, i8 %amount) {
 ; GFX8-LABEL: v_ashr_i8:
 ; GFX8:   ; %bb.0:
 ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX8-NEXT:v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i8:
@@ -49,8 +48,8 @@ define i8 @v_ashr_i8_7(i8 %value) {
 ; GFX8-LABEL: v_ashr_i8_7:
 ; GFX8:   ; %bb.0:
 ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:v_ashrrev_i16_e32 v0, 15, v0
+; GFX8-NEXT:v_mov_b32_e32 v1, 7
+; GFX8-NEXT:v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i8_7:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir
index a9fe80eb47e76..2b911b2dce697 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir
@@ -144,11 +144,9 @@ body: |
 ; VI: liveins: $vgpr0
 ; VI-NEXT: {{  $}}
 ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-; VI-NEXT: [[ABS:%[0-9]+]]:_(s16) = G_ABS [[ASHR]]
+; VI-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+; VI-NEXT: [[ABS:%[0-9]+]]:_(s16) = G_ABS [[TRUNC]]
 ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ABS]](s16)
 ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
index f4aaab745e03b..53905a2f49dd0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
@@ -319,12 +319,10 @@ body: |
 ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
 ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
 ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
-; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C1]](s16)
-; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C1]](s16)
-; VI-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[ASHR]], [

[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Allow forming s16 U/SBFX pre-regbankselect (PR #131309)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131309

>From 2dc7126ab1abb6aa49aaf263a0591759130ddca5 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Wed, 12 Mar 2025 09:43:15 +0100
Subject: [PATCH] [AMDGPU][GlobalISel] Allow forming s16 U/SBFX
 pre-regbankselect

Make s16 G_U/SBFX legal and widen them in RegBankSelect.
This allows the set of BFX formation combines to work on s16 types.
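
For readers unfamiliar with these generic ops, their semantics are roughly as
follows (a standalone sketch, not AMDGPU code; it assumes W > 0 and
Off + W <= 32, which the legalizer guarantees here):

```cpp
#include <cstdint>

// G_UBFX: extract W bits starting at Off, zero-extended.
uint32_t ubfx(uint32_t Src, unsigned Off, unsigned W) {
  uint32_t Mask = (W == 32) ? ~0u : ((1u << W) - 1);
  return (Src >> Off) & Mask;
}

// G_SBFX: the same field, sign-extended from its top bit.
int32_t sbfx(uint32_t Src, unsigned Off, unsigned W) {
  uint32_t Field = ubfx(Src, Off, W);
  uint32_t SignBit = 1u << (W - 1);
  return static_cast<int32_t>((Field ^ SignBit) - SignBit);
}
```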
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   9 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  33 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 645 --
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 380 ---
 .../AMDGPU/GlobalISel/legalize-sbfx.mir   |  26 +-
 .../AMDGPU/GlobalISel/legalize-ubfx.mir   |  27 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |  27 +-
 7 files changed, 503 insertions(+), 644 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index cfb5c3b3006f0..ab900157d2095 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2069,10 +2069,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const 
GCNSubtarget &ST_,
   .minScalar(0, S32)
   .lower();
 
+  // Only {S32, S32} or {S64, S32} should ever reach codegen.
+  // We allow S/UBFX for S16 so the combiner can form them before
+  // RegBankSelect, and RegBankSelect will then legalize them correctly.
   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
-  .legalFor({{S32, S32}, {S64, S32}})
-  .clampScalar(1, S32, S32)
-  .clampScalar(0, S32, S64)
+  .legalFor({{S16, S16}, {S32, S32}, {S64, S32}})
+  .clampScalar(1, S16, S32)
+  .clampScalar(0, S16, S64)
   .widenScalarToNextPow2(0)
   .scalarize(0);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b46fc7d9c752a..1c9d67826186f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1485,7 +1485,9 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   Register DstReg = MI.getOperand(0).getReg();
   LLT Ty = MRI.getType(DstReg);
 
+  const LLT S64 = LLT::scalar(64);
   const LLT S32 = LLT::scalar(32);
+  const LLT S16 = LLT::scalar(16);
 
   unsigned FirstOpnd = isa(MI) ? 2 : 1;
   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
@@ -1495,6 +1497,18 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   const RegisterBank *DstBank =
 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   if (DstBank == &AMDGPU::VGPRRegBank) {
+if (Ty == S16) {
+  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+  B.setInsertPt(B.getMBB(), MI);
+  LegalizerHelper Helper(B.getMF(), ApplyBank, B);
+
+  Helper.widenScalarDst(MI, S32);
+  Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);
+  Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_ZEXT);
+  Helper.widenScalarSrc(MI, S32, 3, AMDGPU::G_ZEXT);
+  return true;
+}
+
 if (Ty == S32)
   return true;
 
@@ -1554,6 +1568,11 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
 
   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
 
+  if (Ty == S16) {
+OffsetReg = B.buildAnyExtOrTrunc(S32, OffsetReg).getReg(0);
+WidthReg = B.buildAnyExtOrTrunc(S32, WidthReg).getReg(0);
+  }
+
   // Ensure the high bits are clear to insert the offset.
   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6));
   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
@@ -1568,13 +1587,21 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
 
   // TODO: It might be worth using a pseudo here to avoid scc clobber and
   // register class constraints.
-  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
- (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+  unsigned Opc = (Ty != S64) ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+ : (Signed ? AMDGPU::S_BFE_I64 : 
AMDGPU::S_BFE_U64);
 
-  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+  Register BFEDst = DstReg;
+  if (Ty == S16) {
+BFEDst = MRI.createGenericVirtualRegister(S32);
+MRI.setRegBank(BFEDst, AMDGPU::SGPRRegBank);
+  }
+  auto MIB = B.buildInstr(Opc, {BFEDst}, {SrcReg, MergedInputs});
   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
 llvm_unreachable("failed to constrain BFE");
 
+  if (BFEDst != DstReg)
+B.buildZExtOrTrunc(DstReg, BFEDst);
+
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 07fcb02d98649..d2b600b04f9fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fsh

[llvm-branch-commits] [llvm] [AMDGPU] Precommit si-fold-bitmask.mir (PR #131310)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131310

>From d4b257d1b34b51018f51546974bffdc2ea56433d Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:00:21 +0100
Subject: [PATCH] [AMDGPU] Precommit si-fold-bitmask.mir

---
 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir | 429 ++
 1 file changed, 429 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir

diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir 
b/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir
new file mode 100644
index 0..1edf970591179
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir
@@ -0,0 +1,429 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=si-fold-operands 
-verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
+
+# Test supported instructions
+
+---
+name: v_ashr_i32_e64__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_ashr_i32_e64__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_ASHR_I32_e64 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_ASHR_I32_e64 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: v_lshr_b32_e64__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_lshr_b32_e64__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e64 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_LSHR_B32_e64 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: v_lshr_b32_e32__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_lshr_b32_e32__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e32 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_LSHR_B32_e32 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: v_lshl_b32_e64__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_lshl_b32_e64__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_LSHL_B32_e64 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_LSHL_B32_e64 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: s_lshl_b32__s_and_b32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $sgpr0, $sgpr1
+
+; GCN-LABEL: name: s_lshl_b32__s_and_b32
+; GCN: liveins: $sgpr0, $sgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:sgpr_32 = COPY $sgpr0
+; GCN-NEXT: %shift:sgpr_32 = COPY $sgpr1
+; GCN-NEXT: %shiftmask:sgpr_32 = S_AND_B32 65535, %shift, implicit-def $scc
+; GCN-NEXT: %ret:sgpr_32 = S_LSHL_B32 %src, %shiftmask, implicit-def $scc
+; GCN-NEXT: $sgpr0 = COPY %ret
+%src:sgpr_32 = COPY $sgpr0
+%shift:sgpr_32 = COPY $sgpr1
+%shiftmask:sgpr_32 = S_AND_B32 65535, %shift, implicit-def $scc
+%ret:sgpr_32 = S_LSHL_B32 %src, %shiftmask, implicit-def $scc
+$sgpr0 = COPY %ret
+...
+
+---
+name: s_lshr_b32__s_and_b32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $sgpr0, $sgpr1
+
+; GCN-LABEL: name: s_lshr_b32__s_and_b32
+; GCN: liveins: $sgpr0, $sgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:sgpr_32 = COPY $sgpr0
+; GCN-NEXT: %shift:sgpr_

[llvm-branch-commits] [llvm] [AMDGPU][SIFoldOperands] Fold some redundant bitmasks (PR #131311)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131311

>From 17e13825f173be8fd67494f13f002f35d93e357f Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:05:19 +0100
Subject: [PATCH 1/2] [AMDGPU][SIFoldOperands] Fold some redundant bitmasks

Instructions like shifts only read some of the bits of the shift-amount 
operand, between 4 and 6 bits.
If that operand is masked with an AND that preserves all of the bits actually 
read, we can just ignore the mask.

Effects are minimal right now, but this will kick in more once we disable 
uniform i16 operation widening in CGP.
With that disabled, we get more i16 shift amounts
that are zext'd, and without this fold we'd end up with
more `s_and_b32 s1, s1, 0x` in the output.

Ideally ISel should handle this but it's proving difficult to get the patterns 
right, and after a few hours of trying I just decided to go with this as it's 
simple enough and it "just works" for this purpose.
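
The core test is small enough to sketch standalone (llvm::maskTrailingOnes is
the real helper; a local equivalent is used here to keep the example
self-contained):

```cpp
#include <cstdint>

// Stand-in for llvm::maskTrailingOnes<uint32_t>(N); valid for N < 32, which
// covers the 4-6 bits a shift instruction actually reads.
constexpr uint32_t maskTrailingOnes(unsigned N) { return (1u << N) - 1; }

// An AND immediate is redundant for a shift-amount operand if it preserves
// every bit the shift reads, e.g. 0xffff when only 5 bits are read.
bool isAndBitMaskRedundant(uint32_t AndImm, unsigned NumBitsRead) {
  const uint32_t Mask = maskTrailingOnes(NumBitsRead);
  return (AndImm & Mask) == Mask;
}
```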
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp |  97 +++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 201 -
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 207 --
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll|   6 +-
 llvm/test/CodeGen/AMDGPU/constrained-shift.ll |   1 -
 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir |  26 +--
 8 files changed, 303 insertions(+), 251 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp 
b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index cc15dd7cb495c..5f666e10b5cb7 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -131,6 +131,7 @@ class SIFoldOperandsImpl {
   std::optional getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
+  bool tryFoldBitMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
@@ -1447,6 +1448,99 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr 
&MI) const {
   return true;
 }
 
+static bool getBitsReadByInst(unsigned Opc, unsigned &NumBitsRead,
+  unsigned &OpIdx) {
+  switch (Opc) {
+  case AMDGPU::V_ASHR_I32_e64:
+  case AMDGPU::V_ASHR_I32_e32:
+  case AMDGPU::V_LSHR_B32_e64:
+  case AMDGPU::V_LSHR_B32_e32:
+  case AMDGPU::V_LSHL_B32_e64:
+  case AMDGPU::V_LSHL_B32_e32:
+  case AMDGPU::S_LSHL_B32:
+  case AMDGPU::S_LSHR_B32:
+  case AMDGPU::S_ASHR_I32:
+NumBitsRead = 5;
+OpIdx = 2;
+return true;
+  case AMDGPU::S_LSHL_B64:
+  case AMDGPU::S_LSHR_B64:
+  case AMDGPU::S_ASHR_I64:
+NumBitsRead = 6;
+OpIdx = 2;
+return true;
+  case AMDGPU::V_LSHLREV_B32_e64:
+  case AMDGPU::V_LSHLREV_B32_e32:
+  case AMDGPU::V_LSHRREV_B32_e64:
+  case AMDGPU::V_LSHRREV_B32_e32:
+  case AMDGPU::V_ASHRREV_I32_e64:
+  case AMDGPU::V_ASHRREV_I32_e32:
+NumBitsRead = 5;
+OpIdx = 1;
+return true;
+  default:
+return false;
+  }
+}
+
+static bool isAndBitMaskRedundant(MachineInstr &MI, unsigned BitsNeeded,
+unsigned &SrcOp) {
+  MachineOperand *RegOp = &MI.getOperand(1);
+  MachineOperand *ImmOp = &MI.getOperand(2);
+
+  if (!RegOp->isReg() || !ImmOp->isImm()) {
+if (ImmOp->isReg() && RegOp->isImm())
+  std::swap(RegOp, ImmOp);
+else
+  return false;
+  }
+
+  SrcOp = RegOp->getOperandNo();
+
+  const unsigned BitMask = maskTrailingOnes(BitsNeeded);
+  return (ImmOp->getImm() & BitMask) == BitMask;
+}
+
+bool SIFoldOperandsImpl::tryFoldBitMask(MachineInstr &MI) const {
+  unsigned NumBitsRead = 0;
+  unsigned OpIdx = 0;
+  if (!getBitsReadByInst(MI.getOpcode(), NumBitsRead, OpIdx))
+return false;
+
+  MachineOperand &Op = MI.getOperand(OpIdx);
+  if (!Op.isReg())
+return false;
+
+  Register OpReg = Op.getReg();
+  if (OpReg.isPhysical())
+return false;
+
+  MachineInstr *OpDef = MRI->getVRegDef(OpReg);
+  if (!OpDef)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "tryFoldBitMask: " << MI << "\tOpIdx:" << OpIdx << ", 
NumBitsRead:" << NumBitsRead << "\n");
+
+  unsigned ReplaceWith;
+  switch (OpDef->getOpcode()) {
+  // TODO: add more opcodes?
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
+if (!isAndBitMaskRedundant(*OpDef, NumBitsRead, ReplaceWith))
+  return false;
+break;
+  default:
+return false;
+  }
+
+  MachineOperand &ReplaceWithOp = OpDef->getOperand(ReplaceWith);
+  LLVM_DEBUG(dbgs() << "\treplacing operand with:" << ReplaceWithOp << "\n");
+
+  MI.getOperand(OpIdx).setReg(ReplaceWithOp.getReg());
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
   if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
   MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -1458,7 +1552,7 @@ bool SIFoldOperands

[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Combine (sext (trunc (sext_in_reg x))) (PR #131312)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131312

>From 9fabf931105e1cf86cf69f90bd5c62068846c3e1 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:34:51 +0100
Subject: [PATCH] [AMDGPU][GlobalISel] Combine (sext (trunc (sext_in_reg x)))

This is a bit of an awkward pattern that can come up as a result
of legalization and the subsequent widening of i16 operations to i32 in
RegBankSelect on AMDGPU.

This quick combine avoids redundant patterns like
```
s_sext_i32_i8 s0, s0
s_sext_i32_i16 s0, s0
s_ashr_i32 s0, s0, s1
```

With this combine, the second (redundant) sign extension is removed.
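
The underlying identity, as a hedged C++ illustration (not the TableGen
pattern itself): when the truncation type is at least as wide as the
sext_inreg width, the bits the trunc drops are all copies of the sign bit, so
the outer sext reproduces the sext_inreg result exactly.

```cpp
#include <cstdint>

// Before: G_SEXT_INREG x, 8; G_TRUNC to s16; G_SEXT back to s32.
int32_t before(int32_t X) {
  int32_t Inreg = static_cast<int32_t>(static_cast<uint32_t>(X) << 24) >> 24;
  int16_t Trunc = static_cast<int16_t>(Inreg); // 16 >= 8: value preserved
  return static_cast<int32_t>(Trunc);
}

// After: the trunc/sext pair folds away; only the sext_inreg remains.
int32_t after(int32_t X) {
  return static_cast<int32_t>(static_cast<uint32_t>(X) << 24) >> 24;
}
```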
---
 .../include/llvm/Target/GlobalISel/Combine.td | 12 ++-
 .../combine-sext-trunc-sextinreg.mir  | 86 +++
 .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 78 -
 3 files changed, 113 insertions(+), 63 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td 
b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 3590ab221ad44..9727b86b4be8b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -258,6 +258,14 @@ def sext_trunc_sextload : GICombineRule<
  [{ return Helper.matchSextTruncSextLoad(*${d}); }]),
   (apply [{ Helper.applySextTruncSextLoad(*${d}); }])>;
 
+def sext_trunc_sextinreg : GICombineRule<
+  (defs root:$dst),
+  (match (G_SEXT_INREG $sir, $src, $width),
+ (G_TRUNC $trunc, $sir),
+ (G_SEXT $dst, $trunc),
+ [{ return (MRI.getType(${trunc}.getReg()).getScalarSizeInBits() >= 
${width}.getImm()); }]),
+  (apply (GIReplaceReg $dst, $sir))>;
+
 def sext_inreg_of_load_matchdata : GIDefMatchData<"std::tuple">;
 def sext_inreg_of_load : GICombineRule<
   (defs root:$root, sext_inreg_of_load_matchdata:$matchinfo),
@@ -1896,7 +1904,9 @@ def cast_of_cast_combines: GICombineGroup<[
   sext_of_anyext,
   anyext_of_anyext,
   anyext_of_zext,
-  anyext_of_sext
+  anyext_of_sext,
+
+  sext_trunc_sextinreg
 ]>;
 
 def cast_combines: GICombineGroup<[
diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir
new file mode 100644
index 0..d41e5b172efc2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir
@@ -0,0 +1,86 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 
-run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | 
FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 
-run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: trunc_s16_inreg_8
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: trunc_s16_inreg_8
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 8
+%trunc:_(s16) = G_TRUNC %inreg
+%sext:_(s32) = G_SEXT %trunc
+$vgpr0 = COPY %sext
+...
+
+---
+name: trunc_s16_inreg_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: trunc_s16_inreg_16
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
+; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 16
+%trunc:_(s16) = G_TRUNC %inreg
+%sext:_(s32) = G_SEXT %trunc
+$vgpr0 = COPY %sext
+...
+
+---
+name: trunc_s8_inreg_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: trunc_s8_inreg_16
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
+; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
+; CHECK-NEXT: %sext:_(s32) = G_SEXT %trunc(s8)
+; CHECK-NEXT: $vgpr0 = COPY %sext(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 16
+%trunc:_(s8) = G_TRUNC %inreg
+%sext:_(s32) = G_SEXT %trunc
+$vgpr0 = COPY %sext
+...
+
+# TODO?: We could handle this by inserting a trunc, but I'm not sure how 
useful that'd be.
+---
+name: mismatching_types
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: mismatching_types
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
+; CHECK-NEXT: %sext:_(s16

[llvm-branch-commits] [mlir] [OpenMP][MLIR] Refactor code related to collecting privatizer info into a shared util (PR #131582)

2025-03-17 Thread Kareem Ergawy via llvm-branch-commits

https://github.com/ergawy updated 
https://github.com/llvm/llvm-project/pull/131582

>From 19e8b682071fcdd3fb0b26383c5e0dc20a357da8 Mon Sep 17 00:00:00 2001
From: ergawy 
Date: Mon, 17 Mar 2025 03:37:00 -0500
Subject: [PATCH] [OpenMP][MLIR] Refactor code related to collecting privatizer
 info into a shared util

Moves the code needed to collect info about delayed privatizers into a
shared util instead of repeating the same pattern across all relevant
constructs.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 257 --
 1 file changed, 107 insertions(+), 150 deletions(-)

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 537558a83cb36..aff874643d41f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -696,20 +696,42 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase 
&builder,
   return success();
 }
 
-/// Populates `privatizations` with privatization declarations used for the
-/// given op.
-template 
-static void collectPrivatizationDecls(
-OP op, SmallVectorImpl &privatizations) {
-  std::optional attr = op.getPrivateSyms();
-  if (!attr)
-return;
+/// A util to collect info needed to convert delayed privatizers from MLIR to
+/// LLVM.
+struct PrivateVarsInfo {
+  template 
+  PrivateVarsInfo(OP op)
+  : privateBlockArgs(
+cast(*op).getPrivateBlockArgs()) {
+mlirPrivateVars.reserve(privateBlockArgs.size());
+llvmPrivateVars.reserve(privateBlockArgs.size());
+collectPrivatizationDecls(op, privateDecls);
 
-  privatizations.reserve(privatizations.size() + attr->size());
-  for (auto symbolRef : attr->getAsRange()) {
-privatizations.push_back(findPrivatizer(op, symbolRef));
+for (mlir::Value privateVar : op.getPrivateVars())
+  mlirPrivateVars.push_back(privateVar);
   }
-}
+
+  MutableArrayRef privateBlockArgs;
+  SmallVector mlirPrivateVars;
+  SmallVector llvmPrivateVars;
+  SmallVector privateDecls;
+
+private:
+  /// Populates `privatizations` with privatization declarations used for the
+  /// given op.
+  template <typename OP>
+  static void collectPrivatizationDecls(
+      OP op, SmallVectorImpl<omp::PrivateClauseOp> &privatizations) {
+    std::optional<ArrayAttr> attr = op.getPrivateSyms();
+    if (!attr)
+      return;
+
+    privatizations.reserve(privatizations.size() + attr->size());
+    for (auto symbolRef : attr->getAsRange<SymbolRefAttr>()) {
+      privatizations.push_back(findPrivatizer(op, symbolRef));
+    }
+  }
+};
 
 /// Populates `reductions` with reduction declarations used in the given op.
 template 
@@ -1384,19 +1406,18 @@ static llvm::Expected initPrivateVar(
 static llvm::Error
 initPrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-                MutableArrayRef<BlockArgument> privateBlockArgs,
-                MutableArrayRef<omp::PrivateClauseOp> privateDecls,
-                MutableArrayRef<mlir::Value> mlirPrivateVars,
-                llvm::SmallVectorImpl<llvm::Value *> &llvmPrivateVars,
+                PrivateVarsInfo &privateVarsInfo,
                 llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
-  if (privateBlockArgs.empty())
+  if (privateVarsInfo.privateBlockArgs.empty())
 return llvm::Error::success();
 
   llvm::BasicBlock *privInitBlock = splitBB(builder, true, "omp.private.init");
   setInsertPointForPossiblyEmptyBlock(builder, privInitBlock);
 
   for (auto [idx, zip] : llvm::enumerate(llvm::zip_equal(
-           privateDecls, mlirPrivateVars, privateBlockArgs, llvmPrivateVars))) {
+   privateVarsInfo.privateDecls, privateVarsInfo.mlirPrivateVars,
+   privateVarsInfo.privateBlockArgs,
+   privateVarsInfo.llvmPrivateVars))) {
 auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVar] = zip;
    llvm::Expected<llvm::Value *> privVarOrErr = initPrivateVar(
 builder, moduleTranslation, privDecl, mlirPrivVar, blockArg,
@@ -1420,10 +1441,7 @@ initPrivateVars(llvm::IRBuilderBase &builder,
 static llvm::Expected
 allocatePrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-                    MutableArrayRef<BlockArgument> privateBlockArgs,
-                    MutableArrayRef<omp::PrivateClauseOp> privateDecls,
-                    MutableArrayRef<mlir::Value> mlirPrivateVars,
-                    llvm::SmallVectorImpl<llvm::Value *> &llvmPrivateVars,
+                    PrivateVarsInfo &privateVarsInfo,
                     const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
                     llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
   // Allocate private vars
@@ -1449,8 +1467,9 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
->getDataLayout()
.getProgramAddressSpace();
 
-  for (auto [privDecl, mlirPrivVar, blockArg] :
-   llvm::zip_equal(privateDecls, mlirPrivateVars, privateBlockArgs)) {
+  for (auto [privDecl, mlirPrivVar, blockArg] : llvm::

[llvm-branch-commits] [llvm] [CodeGen][NPM] Port LiveDebugValues to NPM (PR #131563)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -77,36 +78,68 @@ class LiveDebugValues : public MachineFunctionPass {
 AU.setPreservesCFG();
 MachineFunctionPass::getAnalysisUsage(AU);
   }
+};
+
+struct LiveDebugValues {
+  LiveDebugValues();
+  ~LiveDebugValues() = default;
+  bool run(MachineFunction &MF, bool ShouldEmitDebugEntryValues);
 
 private:
   std::unique_ptr<LDVImpl> InstrRefImpl;
   std::unique_ptr<LDVImpl> VarLocImpl;
-  TargetPassConfig *TPC = nullptr;
   MachineDominatorTree MDT;
 };
 } // namespace
 
-char LiveDebugValues::ID = 0;
+char LiveDebugValuesLegacy::ID = 0;
 
-char &llvm::LiveDebugValuesID = LiveDebugValues::ID;
+char &llvm::LiveDebugValuesID = LiveDebugValuesLegacy::ID;
 
-INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", 
false,
-false)
+INITIALIZE_PASS(LiveDebugValuesLegacy, DEBUG_TYPE, "Live DEBUG_VALUE analysis",
+false, false)
 
 /// Default construct and initialize the pass.
-LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
-  initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry());
+LiveDebugValuesLegacy::LiveDebugValuesLegacy() : MachineFunctionPass(ID) {
+  initializeLiveDebugValuesLegacyPass(*PassRegistry::getPassRegistry());
+}
+
+LiveDebugValues::LiveDebugValues() {
   InstrRefImpl =
       std::unique_ptr<LDVImpl>(llvm::makeInstrRefBasedLiveDebugValues());
   VarLocImpl =
       std::unique_ptr<LDVImpl>(llvm::makeVarLocBasedLiveDebugValues());
 }
 
-bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+LiveDebugValuesPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+  if (!LiveDebugValues().run(MF, ShouldEmitDebugEntryValues))
+return PreservedAnalyses::all();
+  auto PA = getMachineFunctionPassPreservedAnalyses();

arsenm wrote:

I would hope this preserves everything 
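
For reference, a minimal sketch of what unconditional preservation would look
like (an editorial illustration of the comment, not code from the PR):

```cpp
PreservedAnalyses
LiveDebugValuesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  // DBG_VALUE insertion changes neither the CFG nor register contents,
  // so arguably every analysis could be reported as preserved.
  LiveDebugValues().run(MF, ShouldEmitDebugEntryValues);
  return PreservedAnalyses::all();
}
```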

https://github.com/llvm/llvm-project/pull/131563
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NPM] Port LiveDebugValues to NPM (PR #131563)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -77,36 +78,68 @@ class LiveDebugValues : public MachineFunctionPass {
 AU.setPreservesCFG();
 MachineFunctionPass::getAnalysisUsage(AU);
   }
+};
+
+struct LiveDebugValues {
+  LiveDebugValues();
+  ~LiveDebugValues() = default;
+  bool run(MachineFunction &MF, bool ShouldEmitDebugEntryValues);
 
 private:
   std::unique_ptr<LDVImpl> InstrRefImpl;
   std::unique_ptr<LDVImpl> VarLocImpl;
-  TargetPassConfig *TPC = nullptr;
   MachineDominatorTree MDT;
 };
 } // namespace
 
-char LiveDebugValues::ID = 0;
+char LiveDebugValuesLegacy::ID = 0;
 
-char &llvm::LiveDebugValuesID = LiveDebugValues::ID;
+char &llvm::LiveDebugValuesID = LiveDebugValuesLegacy::ID;
 
-INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", 
false,
-false)
+INITIALIZE_PASS(LiveDebugValuesLegacy, DEBUG_TYPE, "Live DEBUG_VALUE analysis",
+false, false)
 
 /// Default construct and initialize the pass.
-LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
-  initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry());
+LiveDebugValuesLegacy::LiveDebugValuesLegacy() : MachineFunctionPass(ID) {
+  initializeLiveDebugValuesLegacyPass(*PassRegistry::getPassRegistry());
+}
+
+LiveDebugValues::LiveDebugValues() {
   InstrRefImpl =
   std::unique_ptr(llvm::makeInstrRefBasedLiveDebugValues());
   VarLocImpl = 
std::unique_ptr(llvm::makeVarLocBasedLiveDebugValues());
 }
 
-bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+LiveDebugValuesPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+  if (!LiveDebugValues().run(MF, ShouldEmitDebugEntryValues))
+return PreservedAnalyses::all();
+  auto PA = getMachineFunctionPassPreservedAnalyses();

arsenm wrote:

But this just copies what the old PM does 

https://github.com/llvm/llvm-project/pull/131563
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NPM] Port LiveDebugValues to NPM (PR #131563)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -0,0 +1,32 @@
+//===- llvm/CodeGen/LiveDebugValuesPass.h ---*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_CODEGEN_LIVEDEBUGVALUESPASS_H
+#define LLVM_CODEGEN_LIVEDEBUGVALUESPASS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class LiveDebugValuesPass : public PassInfoMixin<LiveDebugValuesPass> {
+  bool ShouldEmitDebugEntryValues;

arsenm wrote:

```suggestion
  const bool ShouldEmitDebugEntryValues;
```

https://github.com/llvm/llvm-project/pull/131563
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NPM] Port LiveDebugValues to NPM (PR #131563)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -77,36 +78,68 @@ class LiveDebugValues : public MachineFunctionPass {
 AU.setPreservesCFG();
 MachineFunctionPass::getAnalysisUsage(AU);
   }
+};
+
+struct LiveDebugValues {
+  LiveDebugValues();
+  ~LiveDebugValues() = default;
+  bool run(MachineFunction &MF, bool ShouldEmitDebugEntryValues);
 
 private:
   std::unique_ptr<LDVImpl> InstrRefImpl;
   std::unique_ptr<LDVImpl> VarLocImpl;
-  TargetPassConfig *TPC = nullptr;
   MachineDominatorTree MDT;
 };
 } // namespace
 
-char LiveDebugValues::ID = 0;
+char LiveDebugValuesLegacy::ID = 0;
 
-char &llvm::LiveDebugValuesID = LiveDebugValues::ID;
+char &llvm::LiveDebugValuesID = LiveDebugValuesLegacy::ID;
 
-INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", 
false,
-false)
+INITIALIZE_PASS(LiveDebugValuesLegacy, DEBUG_TYPE, "Live DEBUG_VALUE analysis",
+false, false)
 
 /// Default construct and initialize the pass.
-LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
-  initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry());
+LiveDebugValuesLegacy::LiveDebugValuesLegacy() : MachineFunctionPass(ID) {
+  initializeLiveDebugValuesLegacyPass(*PassRegistry::getPassRegistry());
+}
+
+LiveDebugValues::LiveDebugValues() {
   InstrRefImpl =
   std::unique_ptr(llvm::makeInstrRefBasedLiveDebugValues());
   VarLocImpl = 
std::unique_ptr(llvm::makeVarLocBasedLiveDebugValues());
 }
 
-bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+LiveDebugValuesPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+  if (!LiveDebugValues().run(MF, ShouldEmitDebugEntryValues))
+return PreservedAnalyses::all();
+  auto PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet();
+  return PA;
+}
+
+void LiveDebugValuesPass::printPipeline(
+raw_ostream &OS, function_ref MapClassName2PassName) 
{
+  OS << MapClassName2PassName(name());
+  if (ShouldEmitDebugEntryValues)
+OS << "";
+}
+
+bool LiveDebugValuesLegacy::runOnMachineFunction(MachineFunction &MF) {
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  assert(TPC && "TargetPassConfig must be available");

arsenm wrote:

This should not be an assert. Should just be getAnalysis 
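
A sketch of the suggested shape (hypothetical; it assumes the pass declares
`AU.addRequired<TargetPassConfig>()` so `getAnalysis<>` can enforce and
diagnose the dependency instead of the assert):

```cpp
bool LiveDebugValuesLegacy::runOnMachineFunction(MachineFunction &MF) {
  // getAnalysis<> fails loudly if the dependency is missing; no assert needed.
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  bool ShouldEmitDebugEntryValues =
      TPC.getTM<TargetMachine>().Options.ShouldEmitDebugEntryValues();
  return LiveDebugValues().run(MF, ShouldEmitDebugEntryValues);
}
```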

https://github.com/llvm/llvm-project/pull/131563
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [flang][OpenMP] Extend `do concurrent` mapping to multi-range loops (PR #127634)

2025-03-17 Thread Kareem Ergawy via llvm-branch-commits

https://github.com/ergawy updated 
https://github.com/llvm/llvm-project/pull/127634

>From 8b56c277f04c4f2d3a8a387d20454f7ddb86754c Mon Sep 17 00:00:00 2001
From: ergawy 
Date: Tue, 18 Feb 2025 06:17:17 -0600
Subject: [PATCH 1/2] [flang][OpenMP] Extend `do concurrent` mapping to
 multi-range loops

Adds support for converting multi-range loops to OpenMP (on the host
only for now). The changes here "prepare" a loop nest for collapsing by
sinking iteration variables to the innermost `fir.do_loop` op in the
nest.
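
As a rough illustration of the sinking step (FIR shapes are assumed for the
example; the real IR carries types, bounds, and results):

```
// Before: each loop stores its induction variable at its own level,
// so the nest is not perfect and cannot be collapsed directly.
fir.do_loop %i = %lb1 to %ub1 step %c1 {
  fir.store %i to %i_ref
  fir.do_loop %j = %lb2 to %ub2 step %c1 {
    fir.store %j to %j_ref
    // loop body
  }
}

// After: both stores are sunk into the innermost loop, producing a
// perfect nest that can be collapsed into a single omp.loop_nest.
fir.do_loop %i = %lb1 to %ub1 step %c1 {
  fir.do_loop %j = %lb2 to %ub2 step %c1 {
    fir.store %i to %i_ref
    fir.store %j to %j_ref
    // loop body
  }
}
```
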
---
 flang/docs/DoConcurrentConversionToOpenMP.md  |  29 
 .../OpenMP/DoConcurrentConversion.cpp | 139 +-
 .../multiple_iteration_ranges.f90 |  72 +
 3 files changed, 239 insertions(+), 1 deletion(-)
 create mode 100644 
flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90

diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md 
b/flang/docs/DoConcurrentConversionToOpenMP.md
index 19611615ee9d6..ecb4428d7d3ba 100644
--- a/flang/docs/DoConcurrentConversionToOpenMP.md
+++ b/flang/docs/DoConcurrentConversionToOpenMP.md
@@ -173,6 +173,35 @@ omp.parallel {
 
 
 
+### Multi-range loops
+
+The pass currently supports multi-range loops as well. Given the following
+example:
+
+```fortran
+   do concurrent(i=1:n, j=1:m)
+   a(i,j) = i * j
+   end do
+```
+
+The generated `omp.loop_nest` operation looks like:
+
+```
+omp.loop_nest (%arg0, %arg1)
+: index = (%17, %19) to (%18, %20)
+inclusive step (%c1_2, %c1_4) {
+  fir.store %arg0 to %private_i#1 : !fir.ref
+  fir.store %arg1 to %private_j#1 : !fir.ref
+  ...
+  omp.yield
+}
+```
+
+It is worth noting that we have privatized versions for both iteration
+variables: `i` and `j`. These are locally allocated inside the parallel/target
+OpenMP region, similar to what the single-range example in the previous section
+shows.
+
 


[llvm-branch-commits] [llvm] [AMDGPU][SIFoldOperands] Fold some redundant bitmasks (PR #131311)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131311

>From 520757cf40d285b58eb0539840be2bf282c0a0af Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:05:19 +0100
Subject: [PATCH 1/2] [AMDGPU][SIFoldOperands] Fold some redundant bitmasks

Instructions like shifts only read some of the bits of the shift amount
operand (between 4 and 6 bits, depending on the instruction).
If the source operand is being masked, we can just ignore the mask.

Effects are minimal right now but this will kick in more once we disable
uniform i16 operation widening in CGP.
With that disabled, we get more i16 shift amounts that are zext'd, and
without this we'd end up with more `s_and_b32 s1, s1, 0x` in the output.

Ideally ISel should handle this but it's proving difficult to get the patterns 
right, and after a few hours of trying I just decided to go with this as it's 
simple enough and it "just works" for this purpose.
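
To make the fold concrete, a sketch in MIR (shapes borrowed from the
si-fold-bitmasks.mir test precommitted separately; illustrative, not the
pass output verbatim):

```
; The AND keeps bits [15:0] of the shift amount; V_LSHR_B32 only reads
; bits [4:0], which the 0xffff (65535) mask leaves untouched:
%shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
%ret:vgpr_32 = V_LSHR_B32_e64 %src, %shiftmask, implicit $exec
; The fold rewrites the shift to read %shift directly, leaving the AND dead:
%ret:vgpr_32 = V_LSHR_B32_e64 %src, %shift, implicit $exec
```
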
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp |  97 +++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 201 -
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 207 --
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll|   6 +-
 llvm/test/CodeGen/AMDGPU/constrained-shift.ll |   1 -
 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir |  26 +--
 8 files changed, 303 insertions(+), 251 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp 
b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 91df516b80857..a279a0a973e75 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -131,6 +131,7 @@ class SIFoldOperandsImpl {
   std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
+  bool tryFoldBitMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
@@ -1447,6 +1448,99 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr 
&MI) const {
   return true;
 }
 
+static bool getBitsReadByInst(unsigned Opc, unsigned &NumBitsRead,
+  unsigned &OpIdx) {
+  switch (Opc) {
+  case AMDGPU::V_ASHR_I32_e64:
+  case AMDGPU::V_ASHR_I32_e32:
+  case AMDGPU::V_LSHR_B32_e64:
+  case AMDGPU::V_LSHR_B32_e32:
+  case AMDGPU::V_LSHL_B32_e64:
+  case AMDGPU::V_LSHL_B32_e32:
+  case AMDGPU::S_LSHL_B32:
+  case AMDGPU::S_LSHR_B32:
+  case AMDGPU::S_ASHR_I32:
+NumBitsRead = 5;
+OpIdx = 2;
+return true;
+  case AMDGPU::S_LSHL_B64:
+  case AMDGPU::S_LSHR_B64:
+  case AMDGPU::S_ASHR_I64:
+NumBitsRead = 6;
+OpIdx = 2;
+return true;
+  case AMDGPU::V_LSHLREV_B32_e64:
+  case AMDGPU::V_LSHLREV_B32_e32:
+  case AMDGPU::V_LSHRREV_B32_e64:
+  case AMDGPU::V_LSHRREV_B32_e32:
+  case AMDGPU::V_ASHRREV_I32_e64:
+  case AMDGPU::V_ASHRREV_I32_e32:
+NumBitsRead = 5;
+OpIdx = 1;
+return true;
+  default:
+return false;
+  }
+}
+
+static bool isAndBitMaskRedundant(MachineInstr &MI, unsigned BitsNeeded,
+unsigned &SrcOp) {
+  MachineOperand *RegOp = &MI.getOperand(1);
+  MachineOperand *ImmOp = &MI.getOperand(2);
+
+  if (!RegOp->isReg() || !ImmOp->isImm()) {
+if (ImmOp->isReg() && RegOp->isImm())
+  std::swap(RegOp, ImmOp);
+else
+  return false;
+  }
+
+  SrcOp = RegOp->getOperandNo();
+
+  const unsigned BitMask = maskTrailingOnes<unsigned>(BitsNeeded);
+  return (ImmOp->getImm() & BitMask) == BitMask;
+}
+
+bool SIFoldOperandsImpl::tryFoldBitMask(MachineInstr &MI) const {
+  unsigned NumBitsRead = 0;
+  unsigned OpIdx = 0;
+  if (!getBitsReadByInst(MI.getOpcode(), NumBitsRead, OpIdx))
+return false;
+
+  MachineOperand &Op = MI.getOperand(OpIdx);
+  if (!Op.isReg())
+return false;
+
+  Register OpReg = Op.getReg();
+  if (OpReg.isPhysical())
+return false;
+
+  MachineInstr *OpDef = MRI->getVRegDef(OpReg);
+  if (!OpDef)
+return false;
+
+  LLVM_DEBUG(dbgs() << "tryFoldBitMask: " << MI << "\tOpIdx:" << OpIdx << ", 
NumBitsRead:" << NumBitsRead << "\n");
+
+  unsigned ReplaceWith;
+  switch (OpDef->getOpcode()) {
+  // TODO: add more opcodes?
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
+if (!isAndBitMaskRedundant(*OpDef, NumBitsRead, ReplaceWith))
+  return false;
+break;
+  default:
+return false;
+  }
+
+  MachineOperand &ReplaceWithOp = OpDef->getOperand(ReplaceWith);
+  LLVM_DEBUG(dbgs() << "\treplacing operand with:" << ReplaceWithOp << "\n");
+
+  MI.getOperand(OpIdx).setReg(ReplaceWithOp.getReg());
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
   if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
   MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -1458,7 +1552,7 @@ bool SIFoldOperands

[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Combine (sext (trunc (sext_in_reg x))) (PR #131312)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131312

>From 4751d38d86886106c00e9140bf0bb3a3459950cb Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:34:51 +0100
Subject: [PATCH] [AMDGPU][GlobalISel] Combine (sext (trunc (sext_in_reg x)))

This is a bit of an awkward pattern that can come up as a result
of legalization and then widening of i16 operations to i32 in RegBankSelect
on AMDGPU.

This quick combine avoids redundant patterns like
```
s_sext_i32_i8 s0, s0
s_sext_i32_i16 s0, s0
s_ashr_i32 s0, s0, s1
```

With this, the second sext is removed as it's redundant.
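
In generic MIR terms, the rule fires on the chain below (taken from the
trunc_s16_inreg_8 test added here) whenever the trunc type is at least as
wide as the sign-extension width:

```
%inreg:_(s32) = G_SEXT_INREG %copy, 8
%trunc:_(s16) = G_TRUNC %inreg
%sext:_(s32) = G_SEXT %trunc
; s16 holds all 8 sign-extended bits, so %sext can be replaced by %inreg:
$vgpr0 = COPY %inreg(s32)
```
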
---
 .../include/llvm/Target/GlobalISel/Combine.td | 12 ++-
 .../combine-sext-trunc-sextinreg.mir  | 86 +++
 .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 78 -
 3 files changed, 113 insertions(+), 63 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td 
b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 3590ab221ad44..9727b86b4be8b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -258,6 +258,14 @@ def sext_trunc_sextload : GICombineRule<
  [{ return Helper.matchSextTruncSextLoad(*${d}); }]),
   (apply [{ Helper.applySextTruncSextLoad(*${d}); }])>;
 
+def sext_trunc_sextinreg : GICombineRule<
+  (defs root:$dst),
+  (match (G_SEXT_INREG $sir, $src, $width),
+ (G_TRUNC $trunc, $sir),
+ (G_SEXT $dst, $trunc),
+ [{ return (MRI.getType(${trunc}.getReg()).getScalarSizeInBits() >= 
${width}.getImm()); }]),
+  (apply (GIReplaceReg $dst, $sir))>;
+
 def sext_inreg_of_load_matchdata : GIDefMatchData<"std::tuple">;
 def sext_inreg_of_load : GICombineRule<
   (defs root:$root, sext_inreg_of_load_matchdata:$matchinfo),
@@ -1896,7 +1904,9 @@ def cast_of_cast_combines: GICombineGroup<[
   sext_of_anyext,
   anyext_of_anyext,
   anyext_of_zext,
-  anyext_of_sext
+  anyext_of_sext,
+
+  sext_trunc_sextinreg
 ]>;
 
 def cast_combines: GICombineGroup<[
diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir
new file mode 100644
index 0..d41e5b172efc2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir
@@ -0,0 +1,86 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 
-run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | 
FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 
-run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: trunc_s16_inreg_8
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: trunc_s16_inreg_8
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 8
+%trunc:_(s16) = G_TRUNC %inreg
+%sext:_(s32) = G_SEXT %trunc
+$vgpr0 = COPY %sext
+...
+
+---
+name: trunc_s16_inreg_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: trunc_s16_inreg_16
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
+; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 16
+%trunc:_(s16) = G_TRUNC %inreg
+%sext:_(s32) = G_SEXT %trunc
+$vgpr0 = COPY %sext
+...
+
+---
+name: trunc_s8_inreg_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: trunc_s8_inreg_16
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
+; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
+; CHECK-NEXT: %sext:_(s32) = G_SEXT %trunc(s8)
+; CHECK-NEXT: $vgpr0 = COPY %sext(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 16
+%trunc:_(s8) = G_TRUNC %inreg
+%sext:_(s32) = G_SEXT %trunc
+$vgpr0 = COPY %sext
+...
+
+# TODO?: We could handle this by inserting a trunc, but I'm not sure how 
useful that'd be.
+---
+name: mismatching_types
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: mismatching_types
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
+; CHECK-NEXT: %sext:_(s16

[llvm-branch-commits] [llvm] [AMDGPU][RegBankInfo] Promote scalar i16 and/or/xor to i32 (PR #131306)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131306

>From 1af83464f02df212384bd97848b0073d41053234 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Wed, 5 Mar 2025 10:46:01 +0100
Subject: [PATCH 1/2] [AMDGPU][RegBankInfo] Promote scalar i16 and/or/xor to
 i32

See #64591
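
A sketch of the resulting rewrite in generic MIR (illustrative; the actual
widening is performed by LegalizerHelper::widenScalar in the hunk below):

```
; An s16 bitwise op assigned to the SGPR bank has no SALU encoding:
%r:_(s16) = G_AND %a:_(s16), %b:_(s16)
; so RegBankSelect promotes it to 32 bits and truncates the result:
%a32:_(s32) = G_ANYEXT %a(s16)
%b32:_(s32) = G_ANYEXT %b(s16)
%r32:_(s32) = G_AND %a32, %b32
%r:_(s16) = G_TRUNC %r32(s32)
```
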
---
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  28 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll  |  10 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 519 --
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 286 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll   |  10 +-
 5 files changed, 403 insertions(+), 450 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c19ee14ab1574..27b86723ce474 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2416,9 +2416,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
 Register DstReg = MI.getOperand(0).getReg();
 LLT DstTy = MRI.getType(DstReg);
 
-if (DstTy.getSizeInBits() == 1) {
-  const RegisterBank *DstBank =
+const RegisterBank *DstBank =
 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+
+if (DstTy.getSizeInBits() == 1) {
   if (DstBank == &AMDGPU::VCCRegBank)
 break;
 
@@ -2432,6 +2433,29 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
   return;
 }
 
+// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
+// Packed 16-bit operations need to be scalarized and promoted.
+if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) {
+  const LLT S32 = LLT::scalar(32);
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineFunction *MF = MBB->getParent();
+  ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
+  LegalizerHelper Helper(*MF, ApplySALU, B);
+  // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening
+  // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1
+  // as "not".
+  if (MI.getOpcode() == AMDGPU::G_XOR &&
+  mi_match(MI.getOperand(2).getReg(), MRI, m_SpecificICstOrSplat(-1))) 
{
+Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);
+Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_SEXT);
+Helper.widenScalarDst(MI, S32);
+  } else {
+if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
+  llvm_unreachable("widen scalar should have succeeded");
+  }
+  return;
+}
+
 if (DstTy.getSizeInBits() != 64)
   break;
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 1a94429b1b5a1..36359579ea442 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -391,20 +391,20 @@ define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg 
%src0, i16 inreg %src1) {
 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 
inreg %src1) {
 ; GCN-LABEL: s_andn2_i16_multi_use:
 ; GCN:   ; %bb.0:
-; GCN-NEXT:s_xor_b32 s1, s3, -1
+; GCN-NEXT:s_not_b32 s1, s3
 ; GCN-NEXT:s_andn2_b32 s0, s2, s3
 ; GCN-NEXT:; return to shader part epilog
 ;
 ; GFX10-LABEL: s_andn2_i16_multi_use:
 ; GFX10:   ; %bb.0:
 ; GFX10-NEXT:s_andn2_b32 s0, s2, s3
-; GFX10-NEXT:s_xor_b32 s1, s3, -1
+; GFX10-NEXT:s_not_b32 s1, s3
 ; GFX10-NEXT:; return to shader part epilog
 ;
 ; GFX11-LABEL: s_andn2_i16_multi_use:
 ; GFX11:   ; %bb.0:
 ; GFX11-NEXT:s_and_not1_b32 s0, s2, s3
-; GFX11-NEXT:s_xor_b32 s1, s3, -1
+; GFX11-NEXT:s_not_b32 s1, s3
 ; GFX11-NEXT:; return to shader part epilog
   %not.src1 = xor i16 %src1, -1
   %and = and i16 %src0, %not.src1
@@ -482,14 +482,14 @@ define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, 
i16 %src1) {
 define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
 ; GCN-LABEL: v_andn2_i16_vs:
 ; GCN:   ; %bb.0:
-; GCN-NEXT:s_xor_b32 s0, s2, -1
+; GCN-NEXT:s_not_b32 s0, s2
 ; GCN-NEXT:v_and_b32_e32 v0, s0, v0
 ; GCN-NEXT:v_and_b32_e32 v0, 0x, v0
 ; GCN-NEXT:; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: v_andn2_i16_vs:
 ; GFX10PLUS:   ; %bb.0:
-; GFX10PLUS-NEXT:s_xor_b32 s0, s2, -1
+; GFX10PLUS-NEXT:s_not_b32 s0, s2
 ; GFX10PLUS-NEXT:v_and_b32_e32 v0, s0, v0
 ; GFX10PLUS-NEXT:v_and_b32_e32 v0, 0x, v0
 ; GFX10PLUS-NEXT:; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index e60739fd84059..3a52497bd6e91 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1052,17 +1052,14 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, 
i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:s_lshr_b32 s2, s2, s3
 ; GFX8-NEXT

[llvm-branch-commits] [llvm] [AMDGPU] Precommit si-fold-bitmask.mir (PR #131310)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131310

>From 6db5fe8cc5ff82cc7dc8751ac584870ddbf1b537 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:00:21 +0100
Subject: [PATCH] [AMDGPU] Precommit si-fold-bitmask.mir

---
 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir | 429 ++
 1 file changed, 429 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir

diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir 
b/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir
new file mode 100644
index 0..1edf970591179
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir
@@ -0,0 +1,429 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=si-fold-operands 
-verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
+
+# Test supported instructions
+
+---
+name: v_ashr_i32_e64__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_ashr_i32_e64__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_ASHR_I32_e64 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_ASHR_I32_e64 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: v_lshr_b32_e64__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_lshr_b32_e64__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e64 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_LSHR_B32_e64 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: v_lshr_b32_e32__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_lshr_b32_e32__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e32 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_LSHR_B32_e32 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: v_lshl_b32_e64__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_lshl_b32_e64__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_LSHL_B32_e64 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_LSHL_B32_e64 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: s_lshl_b32__s_and_b32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $sgpr0, $sgpr1
+
+; GCN-LABEL: name: s_lshl_b32__s_and_b32
+; GCN: liveins: $sgpr0, $sgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:sgpr_32 = COPY $sgpr0
+; GCN-NEXT: %shift:sgpr_32 = COPY $sgpr1
+; GCN-NEXT: %shiftmask:sgpr_32 = S_AND_B32 65535, %shift, implicit-def $scc
+; GCN-NEXT: %ret:sgpr_32 = S_LSHL_B32 %src, %shiftmask, implicit-def $scc
+; GCN-NEXT: $sgpr0 = COPY %ret
+%src:sgpr_32 = COPY $sgpr0
+%shift:sgpr_32 = COPY $sgpr1
+%shiftmask:sgpr_32 = S_AND_B32 65535, %shift, implicit-def $scc
+%ret:sgpr_32 = S_LSHL_B32 %src, %shiftmask, implicit-def $scc
+$sgpr0 = COPY %ret
+...
+
+---
+name: s_lshr_b32__s_and_b32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $sgpr0, $sgpr1
+
+; GCN-LABEL: name: s_lshr_b32__s_and_b32
+; GCN: liveins: $sgpr0, $sgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:sgpr_32 = COPY $sgpr0
+; GCN-NEXT: %shift:sgpr_

[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Allow forming s16 U/SBFX pre-regbankselect (PR #131309)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131309

>From 090fa3eb8b5ebb595a6ec4b78ec337af71466a73 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Wed, 12 Mar 2025 09:43:15 +0100
Subject: [PATCH] [AMDGPU][GlobalISel] Allow forming s16 U/SBFX
 pre-regbankselect

Make s16 G_U/SBFX legal and widen them in RegBankSelect.
This allows the set of BFX formation combines to work on s16 types.
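
On the VGPR side, the widening added to applyMappingBFE amounts to the MIR
rewrite sketched below (assembled from the hunk that follows; the source is
any-extended while offset and width are zero-extended):

```
%d:_(s16) = G_UBFX %src:_(s16), %off:_(s16), %w:_(s16)
; widens to:
%src32:_(s32) = G_ANYEXT %src(s16)
%off32:_(s32) = G_ZEXT %off(s16)
%w32:_(s32) = G_ZEXT %w(s16)
%d32:_(s32) = G_UBFX %src32, %off32, %w32
%d:_(s16) = G_TRUNC %d32(s32)
```
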
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   9 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  33 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 645 --
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 380 ---
 .../AMDGPU/GlobalISel/legalize-sbfx.mir   |  26 +-
 .../AMDGPU/GlobalISel/legalize-ubfx.mir   |  27 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |  27 +-
 7 files changed, 503 insertions(+), 644 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index cfb5c3b3006f0..ab900157d2095 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2069,10 +2069,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const 
GCNSubtarget &ST_,
   .minScalar(0, S32)
   .lower();
 
+  // Only {S32, S32} or {S64, S32} should ever reach codegen.
+  // We allow S/UBFX for S16 so the combiner can form them before
+  // RegBankSelect, and RegBankSelect will then legalize them correctly.
   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
-  .legalFor({{S32, S32}, {S64, S32}})
-  .clampScalar(1, S32, S32)
-  .clampScalar(0, S32, S64)
+  .legalFor({{S16, S16}, {S32, S32}, {S64, S32}})
+  .clampScalar(1, S16, S32)
+  .clampScalar(0, S16, S64)
   .widenScalarToNextPow2(0)
   .scalarize(0);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a7df9a0edd21a..844251be24c42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1485,7 +1485,9 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   Register DstReg = MI.getOperand(0).getReg();
   LLT Ty = MRI.getType(DstReg);
 
+  const LLT S64 = LLT::scalar(64);
   const LLT S32 = LLT::scalar(32);
+  const LLT S16 = LLT::scalar(16);
 
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
@@ -1495,6 +1497,18 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   const RegisterBank *DstBank =
 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   if (DstBank == &AMDGPU::VGPRRegBank) {
+if (Ty == S16) {
+  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+  B.setInsertPt(B.getMBB(), MI);
+  LegalizerHelper Helper(B.getMF(), ApplyBank, B);
+
+  Helper.widenScalarDst(MI, S32);
+  Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);
+  Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_ZEXT);
+  Helper.widenScalarSrc(MI, S32, 3, AMDGPU::G_ZEXT);
+  return true;
+}
+
 if (Ty == S32)
   return true;
 
@@ -1554,6 +1568,11 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
 
   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
 
+  if (Ty == S16) {
+OffsetReg = B.buildAnyExtOrTrunc(S32, OffsetReg).getReg(0);
+WidthReg = B.buildAnyExtOrTrunc(S32, WidthReg).getReg(0);
+  }
+
   // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
@@ -1568,13 +1587,21 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
 
   // TODO: It might be worth using a pseudo here to avoid scc clobber and
   // register class constraints.
-  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
- (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+  unsigned Opc = (Ty != S64) ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+ : (Signed ? AMDGPU::S_BFE_I64 : 
AMDGPU::S_BFE_U64);
 
-  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+  Register BFEDst = DstReg;
+  if (Ty == S16) {
+BFEDst = MRI.createGenericVirtualRegister(S32);
+MRI.setRegBank(BFEDst, AMDGPU::SGPRRegBank);
+  }
+  auto MIB = B.buildInstr(Opc, {BFEDst}, {SrcReg, MergedInputs});
   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
 llvm_unreachable("failed to constrain BFE");
 
+  if (BFEDst != DstReg)
+B.buildZExtOrTrunc(DstReg, BFEDst);
+
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 07fcb02d98649..d2b600b04f9fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fsh

[llvm-branch-commits] [llvm] AMDGPU: Switch scheduler-subrange-crash.ll to generated checks (PR #131316)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Mar 17, 4:47 AM EDT**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/131316).


https://github.com/llvm/llvm-project/pull/131316
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Switch test to generated checks (PR #131315)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Mar 17, 4:47 AM EDT**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/131315).


https://github.com/llvm/llvm-project/pull/131315
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU][Legalizer] Widen i16 G_SEXT_INREG (PR #131308)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131308

>From be5c76eeb981e94017cc2a504f35079d47d7ce5c Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Wed, 5 Mar 2025 13:41:04 +0100
Subject: [PATCH 1/2] [AMDGPU][Legalizer] Widen i16 G_SEXT_INREG

It's better to widen these to avoid them being lowered into a G_ASHR + G_SHL
pair. With this change we just extend to i32, then trunc the result.
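
Concretely, for an 8-bit sign extension of an s16 value (MIR shapes taken
from the legalize-abs.mir update below):

```
; Old lowering: a shift pair on s16.
%shl:_(s16) = G_SHL %trunc, %c8(s16)
%res:_(s16) = G_ASHR %shl, %c8(s16)
; New: widen to s32, do the sext_inreg there, truncate back.
%sir:_(s32) = G_SEXT_INREG %copy, 8
%res:_(s16) = G_TRUNC %sir(s32)
```
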
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |   7 +-
 .../AMDGPU/GlobalISel/legalize-abs.mir|   8 +-
 .../AMDGPU/GlobalISel/legalize-ashr.mir   |  20 +--
 .../AMDGPU/GlobalISel/legalize-sext-inreg.mir | 155 +++---
 .../AMDGPU/GlobalISel/legalize-sext.mir   | 101 ++--
 .../AMDGPU/GlobalISel/legalize-smax.mir   |  33 +++-
 .../AMDGPU/GlobalISel/legalize-smin.mir   |  33 +++-
 .../AMDGPU/GlobalISel/legalize-smulh.mir  | 132 +++
 .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll |  45 ++---
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   | 130 ++-
 11 files changed, 299 insertions(+), 368 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b3a8183beeacf..6e611ebb4b625 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2009,7 +2009,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const 
GCNSubtarget &ST_,
   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
   // RegBankSelect.
   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
-.legalFor({{S32}, {S64}});
+.legalFor({{S32}, {S64}})
+.widenScalarIf(typeIs(0, S16), widenScalarOrEltToNextPow2(0, 32));
 
   if (ST.hasVOP3PInsts()) {
 SextInReg.lowerFor({{V2S16}})
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 493e8cef63890..f81d7f1c300b8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -17,8 +17,7 @@ define i8 @v_ashr_i8(i8 %value, i8 %amount) {
 ; GFX8-LABEL: v_ashr_i8:
 ; GFX8:   ; %bb.0:
 ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX8-NEXT:v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i8:
@@ -49,8 +48,8 @@ define i8 @v_ashr_i8_7(i8 %value) {
 ; GFX8-LABEL: v_ashr_i8_7:
 ; GFX8:   ; %bb.0:
 ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:v_ashrrev_i16_e32 v0, 15, v0
+; GFX8-NEXT:v_mov_b32_e32 v1, 7
+; GFX8-NEXT:v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i8_7:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir
index a9fe80eb47e76..2b911b2dce697 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir
@@ -144,11 +144,9 @@ body: |
 ; VI: liveins: $vgpr0
 ; VI-NEXT: {{  $}}
 ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-; VI-NEXT: [[ABS:%[0-9]+]]:_(s16) = G_ABS [[ASHR]]
+; VI-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+; VI-NEXT: [[ABS:%[0-9]+]]:_(s16) = G_ABS [[TRUNC]]
 ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ABS]](s16)
 ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
index f4aaab745e03b..53905a2f49dd0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
@@ -319,12 +319,10 @@ body: |
 ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
 ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
 ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
-; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C1]](s16)
-; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C1]](s16)
-; VI-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[ASHR]], [

[llvm-branch-commits] [llvm] [AMDGPU][SIFoldOperands] Fold some redundant bitmasks (PR #131311)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131311

>From 520757cf40d285b58eb0539840be2bf282c0a0af Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:05:19 +0100
Subject: [PATCH 1/2] [AMDGPU][SIFoldOperands] Fold some redundant bitmasks

Instructions like shifts only read some of the bits of the shift amount 
operand, between 4 and 6 bits.
If the source operand is being masked, we can just ignore the mask.

Effects are minimal right now but this will kick in more once we disable 
uniform i16 operation widening in CGP.
With that disabled, we get more i16 shift amounts
that are zext'd and without this we'd end up with
more `s_and_b32 s1, s1, 0x` in the output.

Ideally ISel should handle this but it's proving difficult to get the patterns 
right, and after a few hours of trying I just decided to go with this as it's 
simple enough and it "just works" for this purpose.
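
As a hand-written sketch of the fold (not taken from the patch), assuming a
32-bit shift that only reads the low 5 bits of its shift amount:

```
; The AND mask 65535 (0xFFFF) covers bits 0-4, i.e. every bit the shift
; reads, so the mask is redundant ...
%masked:vgpr_32 = V_AND_B32_e32 65535, %amt, implicit $exec
%ret:vgpr_32 = V_LSHLREV_B32_e64 %masked, %src, implicit $exec

; ... and the shift can read the unmasked amount directly:
%ret:vgpr_32 = V_LSHLREV_B32_e64 %amt, %src, implicit $exec
```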
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp |  97 +++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 201 -
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 207 --
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll|   6 +-
 llvm/test/CodeGen/AMDGPU/constrained-shift.ll |   1 -
 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir |  26 +--
 8 files changed, 303 insertions(+), 251 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp 
b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 91df516b80857..a279a0a973e75 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -131,6 +131,7 @@ class SIFoldOperandsImpl {
   std::optional getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
+  bool tryFoldBitMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
@@ -1447,6 +1448,99 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr 
&MI) const {
   return true;
 }
 
+static bool getBitsReadByInst(unsigned Opc, unsigned &NumBitsRead,
+  unsigned &OpIdx) {
+  switch (Opc) {
+  case AMDGPU::V_ASHR_I32_e64:
+  case AMDGPU::V_ASHR_I32_e32:
+  case AMDGPU::V_LSHR_B32_e64:
+  case AMDGPU::V_LSHR_B32_e32:
+  case AMDGPU::V_LSHL_B32_e64:
+  case AMDGPU::V_LSHL_B32_e32:
+  case AMDGPU::S_LSHL_B32:
+  case AMDGPU::S_LSHR_B32:
+  case AMDGPU::S_ASHR_I32:
+NumBitsRead = 5;
+OpIdx = 2;
+return true;
+  case AMDGPU::S_LSHL_B64:
+  case AMDGPU::S_LSHR_B64:
+  case AMDGPU::S_ASHR_I64:
+NumBitsRead = 6;
+OpIdx = 2;
+return true;
+  case AMDGPU::V_LSHLREV_B32_e64:
+  case AMDGPU::V_LSHLREV_B32_e32:
+  case AMDGPU::V_LSHRREV_B32_e64:
+  case AMDGPU::V_LSHRREV_B32_e32:
+  case AMDGPU::V_ASHRREV_I32_e64:
+  case AMDGPU::V_ASHRREV_I32_e32:
+NumBitsRead = 5;
+OpIdx = 1;
+return true;
+  default:
+return false;
+  }
+}
+
+static bool isAndBitMaskRedundant(MachineInstr &MI, unsigned BitsNeeded,
+unsigned &SrcOp) {
+  MachineOperand *RegOp = &MI.getOperand(1);
+  MachineOperand *ImmOp = &MI.getOperand(2);
+
+  if (!RegOp->isReg() || !ImmOp->isImm()) {
+if (ImmOp->isReg() && RegOp->isImm())
+  std::swap(RegOp, ImmOp);
+else
+  return false;
+  }
+
+  SrcOp = RegOp->getOperandNo();
+
+  const unsigned BitMask = maskTrailingOnes(BitsNeeded);
+  return (ImmOp->getImm() & BitMask) == BitMask;
+}
+
+bool SIFoldOperandsImpl::tryFoldBitMask(MachineInstr &MI) const {
+  unsigned NumBitsRead = 0;
+  unsigned OpIdx = 0;
+  if (!getBitsReadByInst(MI.getOpcode(), NumBitsRead, OpIdx))
+return false;
+
+  MachineOperand &Op = MI.getOperand(OpIdx);
+  if (!Op.isReg())
+return false;
+
+  Register OpReg = Op.getReg();
+  if (OpReg.isPhysical())
+return false;
+
+  MachineInstr *OpDef = MRI->getVRegDef(OpReg);
+  if (!OpDef)
+return false;
+
+  LLVM_DEBUG(dbgs() << "tryFoldBitMask: " << MI << "\tOpIdx:" << OpIdx << ", 
NumBitsRead:" << NumBitsRead << "\n");
+
+  unsigned ReplaceWith;
+  switch (OpDef->getOpcode()) {
+  // TODO: add more opcodes?
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
+if (!isAndBitMaskRedundant(*OpDef, NumBitsRead, ReplaceWith))
+  return false;
+break;
+  default:
+return false;
+  }
+
+  MachineOperand &ReplaceWithOp = OpDef->getOperand(ReplaceWith);
+  LLVM_DEBUG(dbgs() << "\treplacing operand with:" << ReplaceWithOp << "\n");
+
+  MI.getOperand(OpIdx).setReg(ReplaceWithOp.getReg());
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
   if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
   MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -1458,7 +1552,7 @@ bool SIFoldOperands

[llvm-branch-commits] [llvm] [SeparateConstOffsetFromGEP] Preserve inbounds flag based on ValueTracking (PR #130617)

2025-03-17 Thread Fabian Ritter via llvm-branch-commits

https://github.com/ritter-x2a updated 
https://github.com/llvm/llvm-project/pull/130617

>From eb82bfb8d8cd73cb79e91c77820d9b79d566195d Mon Sep 17 00:00:00 2001
From: Fabian Ritter 
Date: Mon, 10 Mar 2025 06:55:10 -0400
Subject: [PATCH 1/2] [SeparateConstOffsetFromGEP] Preserve inbounds flag based
 on ValueTracking

If we know that the initial GEP was inbounds, and we change it to a
sequence of GEPs from the same base pointer where every offset is
non-negative, then the new GEPs are inbounds.

For SWDEV-516125.
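
As a rough illustration (hand-written IR, not taken from the patch): when the
variable index is known non-negative and the split-off constant offset is also
non-negative, both GEPs produced by the split may keep the flag:

```llvm
; Split of an inbounds GEP with index (%i.zext + 4), %i.zext non-negative:
%base = getelementptr inbounds i32, ptr %p, i64 %i.zext
%addr = getelementptr inbounds i8, ptr %base, i64 16
```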
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp | 18 +++
 .../AMDGPU/preserve-inbounds.ll   | 23 +++
 .../NVPTX/split-gep-and-gvn.ll| 16 ++---
 .../NVPTX/split-gep.ll|  8 +++
 4 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp 
b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index ab8e979e7b40a..7f93115499bc9 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1052,6 +1052,8 @@ bool 
SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 }
   }
 
+  bool MayRecoverInbounds = AccumulativeByteOffset >= 0 && GEP->isInBounds();
+
   // Remove the constant offset in each sequential index. The resultant GEP
   // computes the variadic base.
   // Notice that we don't remove struct field indices here. If LowerGEP is
@@ -1079,6 +1081,8 @@ bool 
SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 // and the old index if they are not used.
 RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
 RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
+MayRecoverInbounds =
+MayRecoverInbounds && computeKnownBits(NewIdx, 
*DL).isNonNegative();
   }
 }
   }
@@ -1100,11 +1104,15 @@ bool 
SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   // address with silently-wrapping two's complement arithmetic".
   // Therefore, the final code will be a semantically equivalent.
   //
-  // TODO(jingyue): do some range analysis to keep as many inbounds as
-  // possible. GEPs with inbounds are more friendly to alias analysis.
-  // TODO(gep_nowrap): Preserve nuw at least.
-  GEPNoWrapFlags NewGEPFlags = GEPNoWrapFlags::none();
-  GEP->setNoWrapFlags(GEPNoWrapFlags::none());
+  // If the initial GEP was inbounds and all variable indices and the
+  // accumulated offsets are non-negative, they can be added in any order and
+  // the intermediate results are in bounds. So, we can preserve the inbounds
+  // flag for both GEPs. GEPs with inbounds are more friendly to alias 
analysis.
+  //
+  // TODO(gep_nowrap): Preserve nuw?
+  GEPNoWrapFlags NewGEPFlags =
+  MayRecoverInbounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none();
+  GEP->setNoWrapFlags(NewGEPFlags);
 
   // Lowers a GEP to either GEPs with a single index or arithmetic operations.
   if (LowerGEP) {
diff --git 
a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll 
b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll
index 422e5d8215502..01619aa481ddd 100644
--- 
a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll
+++ 
b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll
@@ -16,3 +16,26 @@ entry:
   %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idx
   ret ptr %arrayidx
 }
+
+; All offsets must be positive, so inbounds can be preserved.
+define void @must_be_inbounds(ptr %dst, ptr %src, i32 %i) {
+; CHECK-LABEL: @must_be_inbounds(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[I_PROM:%.*]] = zext i32 [[I:%.*]] to i64
+; CHECK-NEXT:[[TMP0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], 
i64 [[I_PROM]]
+; CHECK-NEXT:[[ARRAYIDX_SRC2:%.*]] = getelementptr inbounds i8, ptr 
[[TMP0]], i64 4
+; CHECK-NEXT:[[TMP1:%.*]] = load float, ptr [[ARRAYIDX_SRC2]], align 4
+; CHECK-NEXT:[[TMP2:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], 
i64 [[I_PROM]]
+; CHECK-NEXT:[[ARRAYIDX_DST4:%.*]] = getelementptr inbounds i8, ptr 
[[TMP2]], i64 4
+; CHECK-NEXT:store float [[TMP1]], ptr [[ARRAYIDX_DST4]], align 4
+; CHECK-NEXT:ret void
+;
+entry:
+  %i.prom = zext i32 %i to i64
+  %idx = add nsw i64 %i.prom, 1
+  %arrayidx.src = getelementptr inbounds float, ptr %src, i64 %idx
+  %3 = load float, ptr %arrayidx.src, align 4
+  %arrayidx.dst = getelementptr inbounds float, ptr %dst, i64 %idx
+  store float %3, ptr %arrayidx.dst, align 4
+  ret void
+}
diff --git 
a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll 
b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
index 9a73feb2c4b5c..4474585bf9b06 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/

[llvm-branch-commits] [llvm] [AMDGPU][RegBankCombiner] Add cast_of_cast and constant_fold_cast combines (PR #131307)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

Pierre-vh wrote:

### Merge activity

* **Mar 17, 4:51 AM EDT**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/131307).


https://github.com/llvm/llvm-project/pull/131307


[llvm-branch-commits] [llvm] [AMDGPU][RegBankInfo] Promote scalar i16 and/or/xor to i32 (PR #131306)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

Pierre-vh wrote:

### Merge activity

* **Mar 17, 4:51 AM EDT**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/131306).


https://github.com/llvm/llvm-project/pull/131306


[llvm-branch-commits] [llvm] AMDGPU: Switch simplifydemandedbits-recursion.ll to generated checks (PR #131317)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Mar 17, 4:47 AM EDT**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/131317).


https://github.com/llvm/llvm-project/pull/131317


[llvm-branch-commits] [llvm] [AMDGPU][RegBankInfo] Promote scalar i16 and/or/xor to i32 (PR #131306)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits


@@ -2432,6 +2433,29 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
   return;
 }
 
+// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
+// Packed 16-bit operations need to be scalarized and promoted.

Pierre-vh wrote:

It was copy pasted from below and I forgot to remove it, it's irrelevant here

https://github.com/llvm/llvm-project/pull/131306


[llvm-branch-commits] [llvm] [llvm] Add option to emit `callgraph` section (PR #87574)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -0,0 +1,93 @@
+# Test MIR printer and parser for type id field in callSites. It is used
+# for propogating call site type identifiers to emit in the call graph section.
+
+# RUN: llc --call-graph-section %s -run-pass=none -o - | FileCheck %s
+# CHECK: name: main
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: []
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], typeId:
+# CHECK-NEXT: 1234567890 }
+
+--- |  

arsenm wrote:

Don't need so much IR content. Do you need the function bodies at all? 

https://github.com/llvm/llvm-project/pull/87574


[llvm-branch-commits] [mlir] [OpenMP][MLIR] Refactor code related to collecting privatizer info into a shared util (PR #131582)

2025-03-17 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-mlir-openmp

@llvm/pr-subscribers-flang-openmp

Author: Kareem Ergawy (ergawy)


Changes

Moves code needed to collect info about delayed privatizers into a shared util 
instead of repeating the same pattern across all relevant constructs.

---

Patch is 22.93 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/131582.diff


1 Files Affected:

- (modified) 
mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+104-147) 


``diff
diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 17d0a7007729f..315c6b8ccc553 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -696,20 +696,42 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase 
&builder,
   return success();
 }
 
-/// Populates `privatizations` with privatization declarations used for the
-/// given op.
-template 
-static void collectPrivatizationDecls(
-OP op, SmallVectorImpl &privatizations) {
-  std::optional attr = op.getPrivateSyms();
-  if (!attr)
-return;
+/// A util to collect info needed to convert delayed privatizers from MLIR to
+/// LLVM.
+struct PrivateVarsInfo {
+  template 
+  PrivateVarsInfo(OP op)
+  : privateBlockArgs(
+cast(*op).getPrivateBlockArgs()) {
+mlirPrivateVars.reserve(privateBlockArgs.size());
+llvmPrivateVars.reserve(privateBlockArgs.size());
+collectPrivatizationDecls(op, privateDecls);
 
-  privatizations.reserve(privatizations.size() + attr->size());
-  for (auto symbolRef : attr->getAsRange()) {
-privatizations.push_back(findPrivatizer(op, symbolRef));
+for (mlir::Value privateVar : op.getPrivateVars())
+  mlirPrivateVars.push_back(privateVar);
   }
-}
+
+  MutableArrayRef privateBlockArgs;
+  SmallVector mlirPrivateVars;
+  SmallVector llvmPrivateVars;
+  SmallVector privateDecls;
+
+private:
+  /// Populates `privatizations` with privatization declarations used for the
+  /// given op.
+  template 
+  static void collectPrivatizationDecls(
+  OP op, SmallVectorImpl &privatizations) {
+std::optional attr = op.getPrivateSyms();
+if (!attr)
+  return;
+
+privatizations.reserve(privatizations.size() + attr->size());
+for (auto symbolRef : attr->getAsRange()) {
+  privatizations.push_back(findPrivatizer(op, symbolRef));
+}
+  }
+};
 
 /// Populates `reductions` with reduction declarations used in the given op.
 template 
@@ -1384,19 +1406,18 @@ static llvm::Expected initPrivateVar(
 static llvm::Error
 initPrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-MutableArrayRef privateBlockArgs,
-MutableArrayRef privateDecls,
-MutableArrayRef mlirPrivateVars,
-llvm::SmallVectorImpl &llvmPrivateVars,
+PrivateVarsInfo &privateVarsInfo,
 llvm::DenseMap *mappedPrivateVars = nullptr) {
-  if (privateBlockArgs.empty())
+  if (privateVarsInfo.privateBlockArgs.empty())
 return llvm::Error::success();
 
   llvm::BasicBlock *privInitBlock = splitBB(builder, true, "omp.private.init");
   setInsertPointForPossiblyEmptyBlock(builder, privInitBlock);
 
   for (auto [idx, zip] : llvm::enumerate(llvm::zip_equal(
-   privateDecls, mlirPrivateVars, privateBlockArgs, llvmPrivateVars))) 
{
+   privateVarsInfo.privateDecls, privateVarsInfo.mlirPrivateVars,
+   privateVarsInfo.privateBlockArgs,
+   privateVarsInfo.llvmPrivateVars))) {
 auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVar] = zip;
 llvm::Expected privVarOrErr = initPrivateVar(
 builder, moduleTranslation, privDecl, mlirPrivVar, blockArg,
@@ -1420,10 +1441,7 @@ initPrivateVars(llvm::IRBuilderBase &builder,
 static llvm::Expected
 allocatePrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-MutableArrayRef privateBlockArgs,
-MutableArrayRef privateDecls,
-MutableArrayRef mlirPrivateVars,
-llvm::SmallVectorImpl &llvmPrivateVars,
+PrivateVarsInfo &privateVarsInfo,
 const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
 llvm::DenseMap *mappedPrivateVars = nullptr) 
{
   // Allocate private vars
@@ -1449,8 +1467,9 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
->getDataLayout()
.getProgramAddressSpace();
 
-  for (auto [privDecl, mlirPrivVar, blockArg] :
-   llvm::zip_equal(privateDecls, mlirPrivateVars, privateBlockArgs)) {
+  for (auto [privDecl, mlirPrivVar, blockArg] : llvm::zip_equal(
+   privateVarsInfo.privateDecls, priv

[llvm-branch-commits] [llvm] [AMDGPU] Precommit si-fold-bitmask.mir (PR #131310)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> Where and how should that be implemented ? I struggled with that. I tried 
> adding a new special case in TableGen but I just couldn't find the right way 
> to do it. Do I just add it in C++ InstructionSelector before it checks the 
> patterns? Or should it be some kind of post-processing step after the shift 
> has been selected, but before the G_ZEXT is selected?

It already exists as a complex pattern, isUnneededShiftMask. The combiners
should be trying to get the clamping code into this form, which expands the `and`.

https://github.com/llvm/llvm-project/pull/131310


[llvm-branch-commits] [llvm] [llvm] Add option to emit `callgraph` section (PR #87574)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -0,0 +1,93 @@
+# Test MIR printer and parser for type id field in callSites. It is used
+# for propogating call site type identifiers to emit in the call graph section.

arsenm wrote:

```suggestion
# for propagating call site type identifiers to emit in the call graph section.
```

https://github.com/llvm/llvm-project/pull/87574


[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Combine (sext (trunc (sext_in_reg x))) (PR #131312)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh closed 
https://github.com/llvm/llvm-project/pull/131312


[llvm-branch-commits] [llvm] [AMDGPU][SIFoldOperands] Fold some redundant bitmasks (PR #131311)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh closed 
https://github.com/llvm/llvm-project/pull/131311


[llvm-branch-commits] [llvm] release/20.x: [SCEV] Check whether the start is non-zero in `ScalarEvolution::howFarToZero` (#131522) (PR #131568)

2025-03-17 Thread via llvm-branch-commits

llvmbot wrote:

@dtcxzyw What do you think about merging this PR to the release branch?

https://github.com/llvm/llvm-project/pull/131568


[llvm-branch-commits] [flang] [flang][OpenMP] Extend `do concurrent` mapping to multi-range loops (PR #127634)

2025-03-17 Thread Kareem Ergawy via llvm-branch-commits

https://github.com/ergawy updated 
https://github.com/llvm/llvm-project/pull/127634

>From 8b56c277f04c4f2d3a8a387d20454f7ddb86754c Mon Sep 17 00:00:00 2001
From: ergawy 
Date: Tue, 18 Feb 2025 06:17:17 -0600
Subject: [PATCH 1/2] [flang][OpenMP] Extend `do concurrent` mapping to
 multi-range loops

Adds support for converting multi-range loops to OpenMP (on the host
only for now). The changes here "prepare" a loop nest for collapsing by
sinking iteration variables to the innermost `fir.do_loop` op in the
nest.
---
 flang/docs/DoConcurrentConversionToOpenMP.md  |  29 
 .../OpenMP/DoConcurrentConversion.cpp | 139 +-
 .../multiple_iteration_ranges.f90 |  72 +
 3 files changed, 239 insertions(+), 1 deletion(-)
 create mode 100644 
flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90

diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md 
b/flang/docs/DoConcurrentConversionToOpenMP.md
index 19611615ee9d6..ecb4428d7d3ba 100644
--- a/flang/docs/DoConcurrentConversionToOpenMP.md
+++ b/flang/docs/DoConcurrentConversionToOpenMP.md
@@ -173,6 +173,35 @@ omp.parallel {
 
 
 
+### Multi-range loops
+
+The pass currently supports multi-range loops as well. Given the following
+example:
+
+```fortran
+   do concurrent(i=1:n, j=1:m)
+   a(i,j) = i * j
+   end do
+```
+
+The generated `omp.loop_nest` operation looks like:
+
+```
+omp.loop_nest (%arg0, %arg1)
+: index = (%17, %19) to (%18, %20)
+inclusive step (%c1_2, %c1_4) {
+  fir.store %arg0 to %private_i#1 : !fir.ref
+  fir.store %arg1 to %private_j#1 : !fir.ref
+  ...
+  omp.yield
+}
+```
+
+It is worth noting that we have privatized versions for both iteration
+variables: `i` and `j`. These are locally allocated inside the parallel/target
+OpenMP region similar to what the single-range example in previous section
+shows.
+
 

[llvm-branch-commits] [flang] [flang][OpenMP] Extend `do concurrent` mapping to multi-range loops (PR #127634)

2025-03-17 Thread Kareem Ergawy via llvm-branch-commits


@@ -102,6 +105,47 @@ mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp 
doLoop) {
   return result.getDefiningOp();
 }
 
+/// Collects the op(s) responsible for updating a loop's iteration variable 
with
+/// the current iteration number. For example, for the input IR:

ergawy wrote:

Simplified the function to match the current flang pattern. I will mark the 
above comments as resolved since they don't apply anymore.

https://github.com/llvm/llvm-project/pull/127634


[llvm-branch-commits] [flang] [flang][OpenMP] Handle "loop-local values" in `do concurrent` nests (PR #127635)

2025-03-17 Thread Kareem Ergawy via llvm-branch-commits

https://github.com/ergawy updated 
https://github.com/llvm/llvm-project/pull/127635

>From caa2a3061021ca0d67246e3f7f575141aeed9802 Mon Sep 17 00:00:00 2001
From: ergawy 
Date: Tue, 18 Feb 2025 06:40:19 -0600
Subject: [PATCH] [flang][OpenMP] Handle "loop-local values" in `do concurrent`
 nests

Extends `do concurrent` mapping to handle "loop-local values". A loop-local
value is one that is used exclusively inside the loop but allocated outside
of it. This usually corresponds to temporary values that are used inside the
loop body for initializing other variables, for example. After collecting these
values, the pass localizes them to the loop nest by moving their allocations.
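
For intuition, a hand-written sketch of such a value (assumed, not taken from
the patch): after lowering, the temporary's allocation sits outside the nest
even though it is only used inside it, so the pass moves it into the parallel
region:

```fortran
do concurrent(i=1:n)
  tmp = a(i) * 2.0   ! temporary used only inside the loop body;
  b(i) = tmp + 1.0   ! each thread must get its own copy of tmp
end do
```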
---
 flang/docs/DoConcurrentConversionToOpenMP.md  | 51 ++
 .../OpenMP/DoConcurrentConversion.cpp | 68 ++-
 .../DoConcurrent/locally_destroyed_temp.f90   | 62 +
 3 files changed, 180 insertions(+), 1 deletion(-)
 create mode 100644 
flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90

diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md 
b/flang/docs/DoConcurrentConversionToOpenMP.md
index ecb4428d7d3ba..76c54f5bbf587 100644
--- a/flang/docs/DoConcurrentConversionToOpenMP.md
+++ b/flang/docs/DoConcurrentConversionToOpenMP.md
@@ -202,6 +202,57 @@ variables: `i` and `j`. These are locally allocated inside 
the parallel/target
 OpenMP region similar to what the single-range example in previous section
 shows.
 
+### Data environment
+
+By default, variables that are used inside a `do concurrent` loop nest are
+either treated as `shared` in case of mapping to `host`, or mapped into the
+`target` region using a `map` clause in case of mapping to `device`. The only
+exceptions to this are:
+  1. the loop's iteration variable(s) (IV) of **perfect** loop nests. In that
+ case, for each IV, we allocate a local copy as shown by the mapping
+ examples above.
+  1. any values that are from allocations outside the loop nest and used
+ exclusively inside of it. In such cases, a local privatized
+ copy is created in the OpenMP region to prevent multiple teams of threads
+ from accessing and destroying the same memory block, which causes runtime
+ issues. For an example of such cases, see
+ `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90`.
+
+Implicit mapping detection (for mapping to the target device) is still quite
+limited and work to make it smarter is underway for both OpenMP in general 
+and `do concurrent` mapping.
+
+ Non-perfectly-nested loops' IVs
+
+For non-perfectly-nested loops, the IVs are still treated as `shared` or
+`map` entries as pointed out above. This **might not** be consistent with what
+the Fortran specification tells us. In particular, taking the following
+snippets from the spec (version 2023) into account:
+
+> § 3.35
+> --
+> construct entity
+> entity whose identifier has the scope of a construct
+
+> § 19.4
+> --
+>  A variable that appears as an index-name in a FORALL or DO CONCURRENT
+>  construct [...] is a construct entity. A variable that has LOCAL or
+>  LOCAL_INIT locality in a DO CONCURRENT construct is a construct entity.
+> [...]
+> The name of a variable that appears as an index-name in a DO CONCURRENT
+> construct, FORALL statement, or FORALL construct has a scope of the statement
+> or construct. A variable that has LOCAL or LOCAL_INIT locality in a DO
+> CONCURRENT construct has the scope of that construct.
+
+From the above quotes, it seems there is an equivalence between the IV of a `do
+concurrent` loop and a variable with a `LOCAL` locality specifier (equivalent
+to OpenMP's `private` clause). Which means that we should probably
+localize/privatize a `do concurrent` loop's IV even if it is not perfectly
+nested in the nest we are parallelizing. For now, however, we **do not** do
+that as pointed out previously. In the near future, we propose a middle-ground
+solution (see the Next steps section for more details).
+
 

[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Allow forming s16 U/SBFX pre-regbankselect (PR #131309)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131309

>From d65db023bfae0c9a5eaeb5bebac39d75723c27d6 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Wed, 12 Mar 2025 09:43:15 +0100
Subject: [PATCH] [AMDGPU][GlobalISel] Allow forming s16 U/SBFX
 pre-regbankselect

Make s16 G_U/SBFX legal and widen them in RegBankSelect.
This allows the set of BFX formation combines to work on s16 types.
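
A hand-written sketch (not taken from the patch) of the kind of s16 pattern
the BFX-formation combines can now match before RegBankSelect:

```
; An s16 lshr + and ...
%shifted:_(s16) = G_LSHR %x, %c4(s16)
%masked:_(s16) = G_AND %shifted, %c15
; ... can now form a single s16 unsigned bitfield extract (offset 4,
; width 4), which RegBankSelect later widens back to 32 bits:
%masked:_(s16) = G_UBFX %x, %c4(s16), %c4(s16)
```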
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   9 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  33 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 645 --
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 380 ---
 .../AMDGPU/GlobalISel/legalize-sbfx.mir   |  26 +-
 .../AMDGPU/GlobalISel/legalize-ubfx.mir   |  27 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |  27 +-
 7 files changed, 503 insertions(+), 644 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index cfb5c3b3006f0..ab900157d2095 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2069,10 +2069,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const 
GCNSubtarget &ST_,
   .minScalar(0, S32)
   .lower();
 
+  // Only {S32, S32} or {S64, S32} should ever reach codegen.
+  // We allow S/UBFX for S16 so the combiner can form them before
+  // RegBankSelect, and RegBankSelect will then legalize them correctly.
   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
-  .legalFor({{S32, S32}, {S64, S32}})
-  .clampScalar(1, S32, S32)
-  .clampScalar(0, S32, S64)
+  .legalFor({{S16, S16}, {S32, S32}, {S64, S32}})
+  .clampScalar(1, S16, S32)
+  .clampScalar(0, S16, S64)
   .widenScalarToNextPow2(0)
   .scalarize(0);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b46fc7d9c752a..1c9d67826186f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1485,7 +1485,9 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   Register DstReg = MI.getOperand(0).getReg();
   LLT Ty = MRI.getType(DstReg);
 
+  const LLT S64 = LLT::scalar(64);
   const LLT S32 = LLT::scalar(32);
+  const LLT S16 = LLT::scalar(16);
 
   unsigned FirstOpnd = isa(MI) ? 2 : 1;
   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
@@ -1495,6 +1497,18 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   const RegisterBank *DstBank =
 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   if (DstBank == &AMDGPU::VGPRRegBank) {
+if (Ty == S16) {
+  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+  B.setInsertPt(B.getMBB(), MI);
+  LegalizerHelper Helper(B.getMF(), ApplyBank, B);
+
+  Helper.widenScalarDst(MI, S32);
+  Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);
+  Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_ZEXT);
+  Helper.widenScalarSrc(MI, S32, 3, AMDGPU::G_ZEXT);
+  return true;
+}
+
 if (Ty == S32)
   return true;
 
@@ -1554,6 +1568,11 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
 
   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
 
+  if (Ty == S16) {
+OffsetReg = B.buildAnyExtOrTrunc(S32, OffsetReg).getReg(0);
+WidthReg = B.buildAnyExtOrTrunc(S32, WidthReg).getReg(0);
+  }
+
   // Ensure the high bits are clear to insert the offset.
   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6));
   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
@@ -1568,13 +1587,21 @@ bool 
AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
 
   // TODO: It might be worth using a pseudo here to avoid scc clobber and
   // register class constraints.
-  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
- (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+  unsigned Opc = (Ty != S64) ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+ : (Signed ? AMDGPU::S_BFE_I64 : 
AMDGPU::S_BFE_U64);
 
-  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+  Register BFEDst = DstReg;
+  if (Ty == S16) {
+BFEDst = MRI.createGenericVirtualRegister(S32);
+MRI.setRegBank(BFEDst, AMDGPU::SGPRRegBank);
+  }
+  auto MIB = B.buildInstr(Opc, {BFEDst}, {SrcReg, MergedInputs});
   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
 llvm_unreachable("failed to constrain BFE");
 
+  if (BFEDst != DstReg)
+B.buildZExtOrTrunc(DstReg, BFEDst);
+
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 07fcb02d98649..d2b600b04f9fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fsh

[llvm-branch-commits] [llvm] [AMDGPU][SIFoldOperands] Fold some redundant bitmasks (PR #131311)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131311

>From f3fddad8dca1e8ed327d7cc7cfee7a465032dcc4 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:05:19 +0100
Subject: [PATCH 1/2] [AMDGPU][SIFoldOperands] Fold some redundant bitmasks

Instructions like shifts only read some of the bits of the shift amount 
operand, between 4 and 6 bits.
If the source operand is being masked, we can just ignore the mask.

Effects are minimal right now but this will kick in more once we disable 
uniform i16 operation widening in CGP.
With that disabled, we get more i16 shift amounts
that are zext'd and without this we'd end up with
more `s_and_b32 s1, s1, 0x` in the output.

Ideally ISel should handle this but it's proving difficult to get the patterns 
right, and after a few hours of trying I just decided to go with this as it's 
simple enough and it "just works" for this purpose.
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp |  97 +++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 201 -
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 207 --
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll|   6 +-
 llvm/test/CodeGen/AMDGPU/constrained-shift.ll |   1 -
 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir |  26 +--
 8 files changed, 303 insertions(+), 251 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp 
b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index cc15dd7cb495c..5f666e10b5cb7 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -131,6 +131,7 @@ class SIFoldOperandsImpl {
   std::optional getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
+  bool tryFoldBitMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
@@ -1447,6 +1448,99 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr 
&MI) const {
   return true;
 }
 
+static bool getBitsReadByInst(unsigned Opc, unsigned &NumBitsRead,
+  unsigned &OpIdx) {
+  switch (Opc) {
+  case AMDGPU::V_ASHR_I32_e64:
+  case AMDGPU::V_ASHR_I32_e32:
+  case AMDGPU::V_LSHR_B32_e64:
+  case AMDGPU::V_LSHR_B32_e32:
+  case AMDGPU::V_LSHL_B32_e64:
+  case AMDGPU::V_LSHL_B32_e32:
+  case AMDGPU::S_LSHL_B32:
+  case AMDGPU::S_LSHR_B32:
+  case AMDGPU::S_ASHR_I32:
+NumBitsRead = 5;
+OpIdx = 2;
+return true;
+  case AMDGPU::S_LSHL_B64:
+  case AMDGPU::S_LSHR_B64:
+  case AMDGPU::S_ASHR_I64:
+NumBitsRead = 6;
+OpIdx = 2;
+return true;
+  case AMDGPU::V_LSHLREV_B32_e64:
+  case AMDGPU::V_LSHLREV_B32_e32:
+  case AMDGPU::V_LSHRREV_B32_e64:
+  case AMDGPU::V_LSHRREV_B32_e32:
+  case AMDGPU::V_ASHRREV_I32_e64:
+  case AMDGPU::V_ASHRREV_I32_e32:
+NumBitsRead = 5;
+OpIdx = 1;
+return true;
+  default:
+return false;
+  }
+}
+
+static bool isAndBitMaskRedundant(MachineInstr &MI, unsigned BitsNeeded,
+unsigned &SrcOp) {
+  MachineOperand *RegOp = &MI.getOperand(1);
+  MachineOperand *ImmOp = &MI.getOperand(2);
+
+  if (!RegOp->isReg() || !ImmOp->isImm()) {
+if (ImmOp->isReg() && RegOp->isImm())
+  std::swap(RegOp, ImmOp);
+else
+  return false;
+  }
+
+  SrcOp = RegOp->getOperandNo();
+
+  const unsigned BitMask = maskTrailingOnes(BitsNeeded);
+  return (ImmOp->getImm() & BitMask) == BitMask;
+}
+
+bool SIFoldOperandsImpl::tryFoldBitMask(MachineInstr &MI) const {
+  unsigned NumBitsRead = 0;
+  unsigned OpIdx = 0;
+  if (!getBitsReadByInst(MI.getOpcode(), NumBitsRead, OpIdx))
+return false;
+
+  MachineOperand &Op = MI.getOperand(OpIdx);
+  if (!Op.isReg())
+return false;
+
+  Register OpReg = Op.getReg();
+  if (OpReg.isPhysical())
+return false;
+
+  MachineInstr *OpDef = MRI->getVRegDef(OpReg);
+  if (!OpDef)
+return false;
+
+  LLVM_DEBUG(dbgs() << "tryFoldBitMask: " << MI << "\tOpIdx:" << OpIdx << ", 
NumBitsRead:" << NumBitsRead << "\n");
+
+  unsigned ReplaceWith;
+  switch (OpDef->getOpcode()) {
+  // TODO: add more opcodes?
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
+if (!isAndBitMaskRedundant(*OpDef, NumBitsRead, ReplaceWith))
+  return false;
+break;
+  default:
+return false;
+  }
+
+  MachineOperand &ReplaceWithOp = OpDef->getOperand(ReplaceWith);
+  LLVM_DEBUG(dbgs() << "\treplacing operand with:" << ReplaceWithOp << "\n");
+
+  MI.getOperand(OpIdx).setReg(ReplaceWithOp.getReg());
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
   if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
   MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -1458,7 +1552,7 @@ bool SIFoldOperands

[llvm-branch-commits] [llvm] [AMDGPU] Precommit si-fold-bitmask.mir (PR #131310)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131310

>From 65d5012c30366cc713b793a30ab5119ddf8a77af Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:00:21 +0100
Subject: [PATCH] [AMDGPU] Precommit si-fold-bitmask.mir

---
 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir | 429 ++
 1 file changed, 429 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir

diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir 
b/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir
new file mode 100644
index 0..1edf970591179
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir
@@ -0,0 +1,429 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=si-fold-operands 
-verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
+
+# Test supported instructions
+
+---
+name: v_ashr_i32_e64__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_ashr_i32_e64__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_ASHR_I32_e64 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_ASHR_I32_e64 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: v_lshr_b32_e64__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_lshr_b32_e64__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e64 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_LSHR_B32_e64 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: v_lshr_b32_e32__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_lshr_b32_e32__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e32 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_LSHR_B32_e32 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: v_lshl_b32_e64__v_and_b32_e32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0, $vgpr1
+
+; GCN-LABEL: name: v_lshl_b32_e64__v_and_b32_e32
+; GCN: liveins: $vgpr0, $vgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit 
$exec
+; GCN-NEXT: %ret:vgpr_32 = V_LSHL_B32_e64 %src, %shiftmask, implicit $exec
+; GCN-NEXT: $vgpr0 = COPY %ret
+%src:vgpr_32 = COPY $vgpr0
+%shift:vgpr_32 = COPY $vgpr1
+%shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit $exec
+%ret:vgpr_32 = V_LSHL_B32_e64 %src, %shiftmask, implicit $exec
+$vgpr0 = COPY %ret
+...
+
+---
+name: s_lshl_b32__s_and_b32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $sgpr0, $sgpr1
+
+; GCN-LABEL: name: s_lshl_b32__s_and_b32
+; GCN: liveins: $sgpr0, $sgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:sgpr_32 = COPY $sgpr0
+; GCN-NEXT: %shift:sgpr_32 = COPY $sgpr1
+; GCN-NEXT: %shiftmask:sgpr_32 = S_AND_B32 65535, %shift, implicit-def $scc
+; GCN-NEXT: %ret:sgpr_32 = S_LSHL_B32 %src, %shiftmask, implicit-def $scc
+; GCN-NEXT: $sgpr0 = COPY %ret
+%src:sgpr_32 = COPY $sgpr0
+%shift:sgpr_32 = COPY $sgpr1
+%shiftmask:sgpr_32 = S_AND_B32 65535, %shift, implicit-def $scc
+%ret:sgpr_32 = S_LSHL_B32 %src, %shiftmask, implicit-def $scc
+$sgpr0 = COPY %ret
+...
+
+---
+name: s_lshr_b32__s_and_b32
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $sgpr0, $sgpr1
+
+; GCN-LABEL: name: s_lshr_b32__s_and_b32
+; GCN: liveins: $sgpr0, $sgpr1
+; GCN-NEXT: {{  $}}
+; GCN-NEXT: %src:sgpr_32 = COPY $sgpr0
+; GCN-NEXT: %shift:sgpr_

[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Combine (sext (trunc (sext_in_reg x))) (PR #131312)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/131312

>From 782153a9a47d4a0fdb897e811033179fa67c5060 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Fri, 14 Mar 2025 10:34:51 +0100
Subject: [PATCH] [AMDGPU][GlobalISel] Combine (sext (trunc (sext_in_reg x)))

This is a bit of an awkward pattern that can come up as a result
of legalization and then widening of i16 operations to i32 in RegBankSelect
on AMDGPU.

This quick combine avoids redundant patterns like
```
s_sext_i32_i8 s0, s0
s_sext_i32_i16 s0, s0
s_ashr_i32 s0, s0, s1
```

With this, the second sext is removed as it's redundant.
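
The underlying identity is easy to sanity-check in plain C++. The sketch
below is illustrative only (it is not part of the patch); `sextInReg` is a
hypothetical 32-bit model of G_SEXT_INREG, and trunc-to-N-bits followed by
sext back to 32 bits is modelled as another sext_inreg of width N:

```
#include <cassert>
#include <cstdint>

// Model of G_SEXT_INREG on an i32: sign-extend the low 'width' bits.
static int32_t sextInReg(int32_t x, unsigned width) {
  unsigned shift = 32u - width;
  return static_cast<int32_t>(static_cast<uint32_t>(x) << shift) >> shift;
}

int main() {
  // After the inner sext_inreg of width 8, bits [8, 32) are sign copies,
  // so trunc to s16 + sext back to s32 (width 16 >= 8) changes nothing.
  for (int32_t x : {0x7f, -0x80, 0x1234, -1}) {
    int32_t y = sextInReg(x, 8);
    assert(sextInReg(y, 16) == y);
  }
  return 0;
}
```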
---
 .../include/llvm/Target/GlobalISel/Combine.td | 12 ++-
 .../combine-sext-trunc-sextinreg.mir  | 86 +++
 .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 78 -
 3 files changed, 113 insertions(+), 63 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td 
b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 3590ab221ad44..9727b86b4be8b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -258,6 +258,14 @@ def sext_trunc_sextload : GICombineRule<
  [{ return Helper.matchSextTruncSextLoad(*${d}); }]),
   (apply [{ Helper.applySextTruncSextLoad(*${d}); }])>;
 
+def sext_trunc_sextinreg : GICombineRule<
+  (defs root:$dst),
+  (match (G_SEXT_INREG $sir, $src, $width),
+ (G_TRUNC $trunc, $sir),
+ (G_SEXT $dst, $trunc),
+ [{ return (MRI.getType(${trunc}.getReg()).getScalarSizeInBits() >= ${width}.getImm()); }]),
+  (apply (GIReplaceReg $dst, $sir))>;
+
 def sext_inreg_of_load_matchdata : GIDefMatchData<"std::tuple">;
 def sext_inreg_of_load : GICombineRule<
   (defs root:$root, sext_inreg_of_load_matchdata:$matchinfo),
@@ -1896,7 +1904,9 @@ def cast_of_cast_combines: GICombineGroup<[
   sext_of_anyext,
   anyext_of_anyext,
   anyext_of_zext,
-  anyext_of_sext
+  anyext_of_sext,
+
+  sext_trunc_sextinreg
 ]>;
 
 def cast_combines: GICombineGroup<[
diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir
new file mode 100644
index 0..d41e5b172efc2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir
@@ -0,0 +1,86 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: trunc_s16_inreg_8
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: trunc_s16_inreg_8
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 8
+%trunc:_(s16) = G_TRUNC %inreg
+%sext:_(s32) = G_SEXT %trunc
+$vgpr0 = COPY %sext
+...
+
+---
+name: trunc_s16_inreg_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: trunc_s16_inreg_16
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
+; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 16
+%trunc:_(s16) = G_TRUNC %inreg
+%sext:_(s32) = G_SEXT %trunc
+$vgpr0 = COPY %sext
+...
+
+---
+name: trunc_s8_inreg_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: trunc_s8_inreg_16
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
+; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
+; CHECK-NEXT: %sext:_(s32) = G_SEXT %trunc(s8)
+; CHECK-NEXT: $vgpr0 = COPY %sext(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 16
+%trunc:_(s8) = G_TRUNC %inreg
+%sext:_(s32) = G_SEXT %trunc
+$vgpr0 = COPY %sext
+...
+
+# TODO?: We could handle this by inserting a trunc, but I'm not sure how 
useful that'd be.
+---
+name: mismatching_types
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: mismatching_types
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
+; CHECK-NEXT: %sext:_(s16

[llvm-branch-commits] [flang] [flang][OpenMP] Enable delayed privatization by default for `omp.distribute` (PR #131574)

2025-03-17 Thread Kareem Ergawy via llvm-branch-commits

https://github.com/ergawy updated 
https://github.com/llvm/llvm-project/pull/131574

>From 047e4b2327da0ee0ad0398fe8d3c799cefc3f7d8 Mon Sep 17 00:00:00 2001
From: ergawy 
Date: Mon, 17 Mar 2025 02:08:23 -0500
Subject: [PATCH] [flang][OpenMP] Enable delayed privatization by default for
 `omp.distribute`

Switches delayed privatization for `omp.distribute` to be on by default:
controlled by the `-openmp-enable-delayed-privatization` instead of by
`-openmp-enable-delayed-privatization-staging`
---
 flang/lib/Lower/OpenMP/OpenMP.cpp| 2 +-
 flang/test/Lower/OpenMP/distribute.f90   | 2 +-
 flang/test/Lower/OpenMP/order-clause.f90 | 6 +++---
 flang/test/Transforms/stack-arrays-hlfir.f90 | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp 
b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 2cfc1bd88dcef..f753ce1e82288 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2549,7 +2549,7 @@ static void 
genStandaloneDistribute(lower::AbstractConverter &converter,
 
   DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
/*shouldCollectPreDeterminedSymbols=*/true,
-   enableDelayedPrivatizationStaging, symTable);
+   enableDelayedPrivatization, symTable);
   dsp.processStep1(&distributeClauseOps);
 
   mlir::omp::LoopNestOperands loopNestClauseOps;
diff --git a/flang/test/Lower/OpenMP/distribute.f90 
b/flang/test/Lower/OpenMP/distribute.f90
index a4a753dddbac4..ea57d35b964b4 100644
--- a/flang/test/Lower/OpenMP/distribute.f90
+++ b/flang/test/Lower/OpenMP/distribute.f90
@@ -7,7 +7,7 @@ subroutine distribute_simple()
   ! CHECK: omp.teams
   !$omp teams
 
-  ! CHECK: omp.distribute {
+  ! CHECK: omp.distribute private({{.*}}) {
   !$omp distribute
 
   ! CHECK-NEXT: omp.loop_nest
diff --git a/flang/test/Lower/OpenMP/order-clause.f90 
b/flang/test/Lower/OpenMP/order-clause.f90
index 1f678e02708da..d5799079b3759 100644
--- a/flang/test/Lower/OpenMP/order-clause.f90
+++ b/flang/test/Lower/OpenMP/order-clause.f90
@@ -61,15 +61,15 @@ end subroutine do_simd_order_parallel
 
 
 subroutine distribute_order
-   !CHECK: omp.distribute order(reproducible:concurrent) {
+   !CHECK: omp.distribute order(reproducible:concurrent) private({{.*}}) {
!$omp teams distribute order(concurrent)
do i=1,10
end do
-   !CHECK: omp.distribute order(reproducible:concurrent) {
+   !CHECK: omp.distribute order(reproducible:concurrent) private({{.*}}) {
!$omp teams distribute order(reproducible:concurrent)
do i=1,10
end do
-   !CHECK: omp.distribute order(unconstrained:concurrent) {
+   !CHECK: omp.distribute order(unconstrained:concurrent) private({{.*}}) {
!$omp teams distribute order(unconstrained:concurrent)
do i = 1, 10
end do
diff --git a/flang/test/Transforms/stack-arrays-hlfir.f90 
b/flang/test/Transforms/stack-arrays-hlfir.f90
index 06749b7ca88af..e70a1d9b89216 100644
--- a/flang/test/Transforms/stack-arrays-hlfir.f90
+++ b/flang/test/Transforms/stack-arrays-hlfir.f90
@@ -73,7 +73,7 @@ end subroutine omp_target_wsloop
 ! CHECK-NOT:   fir.freemem
 ! CHECK: omp.teams {
 ! CHECK:   fir.alloca !fir.array<2xi64>
-! CHECK: omp.distribute {
+! CHECK: omp.distribute private({{.*}}) {
 ! CHECK: omp.loop_nest {{.*}} {
 ! CHECK-NOT:   fir.allocmem
 ! CHECK-NOT:   fir.freemem

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [OpenMP][MLIR] Refactor code related to collecting privatizer info into a shared util (PR #131582)

2025-03-17 Thread Kareem Ergawy via llvm-branch-commits

https://github.com/ergawy updated 
https://github.com/llvm/llvm-project/pull/131582

>From 1b934bafb44a4e5e07f02dd3daab88bbd27017c3 Mon Sep 17 00:00:00 2001
From: ergawy 
Date: Mon, 17 Mar 2025 03:37:00 -0500
Subject: [PATCH] [OpenMP][MLIR] Refactor code related to collecting privatizer
 info into a shared util

Moves code needed to collect info about delayed privatizers into a
shared util instead of repeating the same pattern across all relevant
constructs.
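
A rough usage sketch (hypothetical call site, not taken from the patch;
`convertOmpWsloop` and the error handling are placeholders) of how a
conversion function now threads one `PrivateVarsInfo` object through the
helpers instead of four parallel vectors:

```
// Schematic only: assumes the PrivateVarsInfo util and the updated
// initPrivateVars signature introduced in this patch.
static LogicalResult
convertOmpWsloop(omp::WsloopOp op, llvm::IRBuilderBase &builder,
                 LLVM::ModuleTranslation &moduleTranslation) {
  // Gathers private block args, MLIR values, and privatizer decls in one go.
  PrivateVarsInfo privateVarsInfo(op);

  if (llvm::Error err =
          initPrivateVars(builder, moduleTranslation, privateVarsInfo)) {
    llvm::consumeError(std::move(err)); // sketch only; real code reports it
    return failure();
  }
  return success();
}
```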
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  | 251 --
 1 file changed, 104 insertions(+), 147 deletions(-)

diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 537558a83cb36..a513dfc4feb09 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -696,20 +696,42 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase 
&builder,
   return success();
 }
 
-/// Populates `privatizations` with privatization declarations used for the
-/// given op.
-template <typename OP>
-static void collectPrivatizationDecls(
-OP op, SmallVectorImpl &privatizations) {
-  std::optional attr = op.getPrivateSyms();
-  if (!attr)
-return;
+/// A util to collect info needed to convert delayed privatizers from MLIR to
+/// LLVM.
+struct PrivateVarsInfo {
+  template <typename OP>
+  PrivateVarsInfo(OP op)
+  : privateBlockArgs(
+cast(*op).getPrivateBlockArgs()) {
+mlirPrivateVars.reserve(privateBlockArgs.size());
+llvmPrivateVars.reserve(privateBlockArgs.size());
+collectPrivatizationDecls(op, privateDecls);
 
-  privatizations.reserve(privatizations.size() + attr->size());
-  for (auto symbolRef : attr->getAsRange()) {
-privatizations.push_back(findPrivatizer(op, symbolRef));
+for (mlir::Value privateVar : op.getPrivateVars())
+  mlirPrivateVars.push_back(privateVar);
   }
-}
+
+  MutableArrayRef privateBlockArgs;
+  SmallVector mlirPrivateVars;
+  SmallVector llvmPrivateVars;
+  SmallVector privateDecls;
+
+private:
+  /// Populates `privatizations` with privatization declarations used for the
+  /// given op.
+  template <typename OP>
+  static void collectPrivatizationDecls(
+  OP op, SmallVectorImpl &privatizations) {
+std::optional attr = op.getPrivateSyms();
+if (!attr)
+  return;
+
+privatizations.reserve(privatizations.size() + attr->size());
+for (auto symbolRef : attr->getAsRange()) {
+  privatizations.push_back(findPrivatizer(op, symbolRef));
+}
+  }
+};
 
 /// Populates `reductions` with reduction declarations used in the given op.
 template 
@@ -1384,19 +1406,18 @@ static llvm::Expected initPrivateVar(
 static llvm::Error
 initPrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-MutableArrayRef privateBlockArgs,
-MutableArrayRef privateDecls,
-MutableArrayRef mlirPrivateVars,
-llvm::SmallVectorImpl &llvmPrivateVars,
+PrivateVarsInfo &privateVarsInfo,
 llvm::DenseMap *mappedPrivateVars = nullptr) {
-  if (privateBlockArgs.empty())
+  if (privateVarsInfo.privateBlockArgs.empty())
 return llvm::Error::success();
 
   llvm::BasicBlock *privInitBlock = splitBB(builder, true, "omp.private.init");
   setInsertPointForPossiblyEmptyBlock(builder, privInitBlock);
 
   for (auto [idx, zip] : llvm::enumerate(llvm::zip_equal(
-   privateDecls, mlirPrivateVars, privateBlockArgs, llvmPrivateVars))) 
{
+   privateVarsInfo.privateDecls, privateVarsInfo.mlirPrivateVars,
+   privateVarsInfo.privateBlockArgs,
+   privateVarsInfo.llvmPrivateVars))) {
 auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVar] = zip;
 llvm::Expected privVarOrErr = initPrivateVar(
 builder, moduleTranslation, privDecl, mlirPrivVar, blockArg,
@@ -1420,10 +1441,7 @@ initPrivateVars(llvm::IRBuilderBase &builder,
 static llvm::Expected
 allocatePrivateVars(llvm::IRBuilderBase &builder,
 LLVM::ModuleTranslation &moduleTranslation,
-MutableArrayRef privateBlockArgs,
-MutableArrayRef privateDecls,
-MutableArrayRef mlirPrivateVars,
-llvm::SmallVectorImpl &llvmPrivateVars,
+PrivateVarsInfo &privateVarsInfo,
 const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
 llvm::DenseMap *mappedPrivateVars = nullptr) 
{
   // Allocate private vars
@@ -1449,8 +1467,9 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
->getDataLayout()
.getProgramAddressSpace();
 
-  for (auto [privDecl, mlirPrivVar, blockArg] :
-   llvm::zip_equal(privateDecls, mlirPrivateVars, privateBlockArgs)) {
+  for (auto [privDecl, mlirPrivVar, blockArg] : llvm::

[llvm-branch-commits] [llvm] [llvm] Extract and propagate indirect call type id (PR #87575)

2025-03-17 Thread Matt Arsenault via llvm-branch-commits


@@ -0,0 +1,102 @@
+;; Test MIR printer and parser for type id field in call site info. Test that
+;; it works well with/without --emit-call-site-info.
+
+;; Multiplex --call-graph-section and -emit-call-site-info as both utilize
+;; CallSiteInfo and callSites.
+
+
+;; Test printer and parser with --call-graph-section only.
+
+;; Test printer.
+;; Verify that fwdArgRegs is not set, typeId is set.
+;; Verify the exact typeId value to ensure it is not garbage but the value
+;; computed as the type id from the type operand bundle.
+; RUN: llc --call-graph-section %s -stop-before=finalize-isel -o %t1.mir
+; RUN: cat %t1.mir | FileCheck %s --check-prefix=PRINTER_CGS
+; PRINTER_CGS: name: main
+; PRINTER_CGS: callSites:
+; PRINTER_CGS-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], typeId:
+; PRINTER_CGS-NEXT: 7854600665770582568 }
+
+
+;; Test parser.
+;; Verify that we get the same result.
+; RUN: llc --call-graph-section %t1.mir -run-pass=finalize-isel -o - \
+; RUN: | FileCheck %s --check-prefix=PARSER_CGS
+; PARSER_CGS: name: main
+; PARSER_CGS: callSites:
+; PARSER_CGS-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], typeId:
+; PARSER_CGS-NEXT: 7854600665770582568 }
+
+
+;; Test printer and parser with -emit-call-site-info only.
+
+;; Test printer.
+;; Verify that fwdArgRegs is set, typeId is not set.
+; RUN: llc -emit-call-site-info %s -stop-before=finalize-isel -o %t2.mir
+; RUN: cat %t2.mir | FileCheck %s --check-prefix=PRINTER_CSI
+; PRINTER_CSI: name: main
+; PRINTER_CSI: callSites:
+; PRINTER_CSI-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs:
+; PRINTER_CSI-NEXT: { arg: 0, reg: '$edi' }
+; PRINTER_CSI-NOT: typeId:
+
+
+;; Test parser.
+;; Verify that we get the same result.
+; RUN: llc -emit-call-site-info %t2.mir -run-pass=finalize-isel -o - \
+; RUN: | FileCheck %s --check-prefix=PARSER_CSI
+; PARSER_CSI: name: main
+; PARSER_CSI: callSites:
+; PARSER_CSI-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs:
+; PARSER_CSI-NEXT: { arg: 0, reg: '$edi' }
+; PARSER_CSI-NOT: typeId:
+
+
+;; Test printer and parser with both -emit-call-site-info and --call-graph-section.
+
+;; Test printer.
+;; Verify both fwdArgRegs and typeId are set.
+;; Verify the exact typeId value to ensure it is not garbage but the value
+;; computed as the type id from the type operand bundle.
+; RUN: llc --call-graph-section -emit-call-site-info %s -stop-before=finalize-isel -o %t2.mir
+; RUN: cat %t2.mir | FileCheck %s --check-prefix=PRINTER_CGS_CSI
+; PRINTER_CGS_CSI: name: main
+; PRINTER_CGS_CSI: callSites:
+; PRINTER_CGS_CSI-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs:
+; PRINTER_CGS_CSI-NEXT: { arg: 0, reg: '$edi' }, typeId:
+; PRINTER_CGS_CSI-NEXT:   7854600665770582568 }
+
+
+;; Test parser.
+;; Verify that we get the same result.
+; RUN: llc --call-graph-section -emit-call-site-info %t2.mir -run-pass=finalize-isel -o - \
+; RUN: | FileCheck %s --check-prefix=PARSER_CGS_CSI
+; PARSER_CGS_CSI: name: main
+; PARSER_CGS_CSI: callSites:
+; PARSER_CGS_CSI-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs:
+; PARSER_CGS_CSI-NEXT: { arg: 0, reg: '$edi' }, typeId:
+; PARSER_CGS_CSI-NEXT:   7854600665770582568 }
+
+
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @foo(i8 signext %a) !type !3 {
+entry:
+  ret void
+}
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @main() !type !4 {
+entry:
+  %retval = alloca i32, align 4
+  %fp = alloca ptr, align 8
+  store i32 0, ptr %retval, align 4
+  store ptr @foo, ptr %fp, align 8
+  %0 = load ptr, ptr %fp, align 8

arsenm wrote:

Use named values in tests 

https://github.com/llvm/llvm-project/pull/87575
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [libcxx] 5c1188d - Revert "[libc++] Optimize num_put integral functions (#120859)"

2025-03-17 Thread via llvm-branch-commits

Author: Alexander Kornienko
Date: 2025-03-17T14:38:37+01:00
New Revision: 5c1188dfb27c3499f58849bde211ac03026d90d1

URL: 
https://github.com/llvm/llvm-project/commit/5c1188dfb27c3499f58849bde211ac03026d90d1
DIFF: 
https://github.com/llvm/llvm-project/commit/5c1188dfb27c3499f58849bde211ac03026d90d1.diff

LOG: Revert "[libc++] Optimize num_put integral functions (#120859)"

This reverts commit 15edf8725a8044e5cb681a5773e0ada1249690cb.

Added: 


Modified: 
libcxx/docs/ReleaseNotes/20.rst
libcxx/include/__charconv/tables.h
libcxx/include/__charconv/to_chars_base_10.h
libcxx/include/__charconv/to_chars_integral.h
libcxx/include/__charconv/to_chars_result.h
libcxx/include/__charconv/traits.h
libcxx/include/__format/formatter_floating_point.h
libcxx/include/__format/formatter_integral.h
libcxx/include/__format/formatter_output.h
libcxx/include/locale
libcxx/include/module.modulemap

libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp

Removed: 
libcxx/test/benchmarks/locale/num_put.bench.cpp



diff  --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst
index c7b59545b4fbc..57ab0c167544b 100644
--- a/libcxx/docs/ReleaseNotes/20.rst
+++ b/libcxx/docs/ReleaseNotes/20.rst
@@ -120,8 +120,6 @@ Improvements and New Features
 
 - Added :ref:`hardening mode ` support for ``forward_list`` and 
``bitset``.
 
-- The ``num_get::do_put`` integral overloads have been optimized, resulting in 
a performance improvement of up to 2.4x.
-
 Deprecations and Removals
 -
 

diff  --git a/libcxx/include/__charconv/tables.h 
b/libcxx/include/__charconv/tables.h
index b8c6fd8af0a0f..9568bf841cd02 100644
--- a/libcxx/include/__charconv/tables.h
+++ b/libcxx/include/__charconv/tables.h
@@ -19,14 +19,16 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+#if _LIBCPP_STD_VER >= 17
+
 namespace __itoa {
 
-inline _LIBCPP_CONSTEXPR const char __base_2_lut[64] = {
+inline constexpr char __base_2_lut[64] = {
 '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '0', '1', 
'1', '0', '1', '0', '0', '0', '1',
 '0', '1', '0', '1', '1', '0', '0', '1', '1', '1', '1', '0', '0', '0', '1', 
'0', '0', '1', '1', '0', '1', '0',
 '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '0', '1', '1', '1', '1', 
'0', '1', '1', '1', '1'};
 
-inline _LIBCPP_CONSTEXPR const char __base_8_lut[128] = {
+inline constexpr char __base_8_lut[128] = {
 '0', '0', '0', '1', '0', '2', '0', '3', '0', '4', '0', '5', '0', '6', '0', 
'7', '1', '0', '1', '1', '1', '2',
 '1', '3', '1', '4', '1', '5', '1', '6', '1', '7', '2', '0', '2', '1', '2', 
'2', '2', '3', '2', '4', '2', '5',
 '2', '6', '2', '7', '3', '0', '3', '1', '3', '2', '3', '3', '3', '4', '3', 
'5', '3', '6', '3', '7', '4', '0',
@@ -34,7 +36,7 @@ inline _LIBCPP_CONSTEXPR const char __base_8_lut[128] = {
 '5', '4', '5', '5', '5', '6', '5', '7', '6', '0', '6', '1', '6', '2', '6', 
'3', '6', '4', '6', '5', '6', '6',
 '6', '7', '7', '0', '7', '1', '7', '2', '7', '3', '7', '4', '7', '5', '7', 
'6', '7', '7'};
 
-inline _LIBCPP_CONSTEXPR const char __base_16_lut[512] = {
+inline constexpr char __base_16_lut[512] = {
 '0', '0', '0', '1', '0', '2', '0', '3', '0', '4', '0', '5', '0', '6', '0', 
'7', '0', '8', '0', '9', '0', 'a', '0',
 'b', '0', 'c', '0', 'd', '0', 'e', '0', 'f', '1', '0', '1', '1', '1', '2', 
'1', '3', '1', '4', '1', '5', '1', '6',
 '1', '7', '1', '8', '1', '9', '1', 'a', '1', 'b', '1', 'c', '1', 'd', '1', 
'e', '1', 'f', '2', '0', '2', '1', '2',
@@ -59,7 +61,7 @@ inline _LIBCPP_CONSTEXPR const char __base_16_lut[512] = {
 '1', 'f', '2', 'f', '3', 'f', '4', 'f', '5', 'f', '6', 'f', '7', 'f', '8', 
'f', '9', 'f', 'a', 'f', 'b', 'f', 'c',
 'f', 'd', 'f', 'e', 'f', 'f'};
 
-inline _LIBCPP_CONSTEXPR const uint32_t __pow10_32[10] = {
+inline constexpr uint32_t __pow10_32[10] = {
 UINT32_C(0),
 UINT32_C(10),
 UINT32_C(100),
@@ -71,7 +73,7 @@ inline _LIBCPP_CONSTEXPR const uint32_t __pow10_32[10] = {
 UINT32_C(100000000),
 UINT32_C(1000000000)};
 
-inline _LIBCPP_CONSTEXPR const uint64_t __pow10_64[20] = {
+inline constexpr uint64_t __pow10_64[20] = {
 UINT64_C(0),
 UINT64_C(10),
 UINT64_C(100),
@@ -94,8 +96,8 @@ inline _LIBCPP_CONSTEXPR const uint64_t __pow10_64[20] = {
 UINT64_C(10000000000000000000)};
 
 #  if _LIBCPP_HAS_INT128
-inline _LIBCPP_CONSTEXPR const int __pow10_128_offset  = 0;
-inline _LIBCPP_CONSTEXPR const __uint128_t __pow10_128[40] = {
+inline constexpr int __pow10_128_offset  = 0;
+inline constexpr __uint128_t __pow10_128[40] = {
 UINT64_C(0),
 UINT64_C(10),
 UINT64_C(100),
@@ -138,7 +140,7 @@ inline _LIBCPP_CONSTEXPR const __uint128_t __pow10_128[40] 
= {
 (__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * 10};
 #  end

[llvm-branch-commits] [llvm] [GlobalISel] Combine redundant sext_inreg (PR #131624)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh created 
https://github.com/llvm/llvm-project/pull/131624

None

>From e36f66595a582b6ba926186674b6da6b41236ff5 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Mon, 17 Mar 2025 13:54:59 +0100
Subject: [PATCH] [GlobalISel] Combine redundant sext_inreg

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   3 +
 .../include/llvm/Target/GlobalISel/Combine.td |   9 +-
 .../GlobalISel/CombinerHelperCasts.cpp|  27 +++
 .../combine-redundant-sext-inreg.mir  | 164 ++
 .../combine-sext-trunc-sextinreg.mir  |  87 ++
 .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll |   5 -
 6 files changed, 289 insertions(+), 6 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-sext-inreg.mir
 create mode 100644 
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h 
b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9b78342c8fc39..5778377d125a8 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -994,6 +994,9 @@ class CombinerHelper {
   // overflow sub
   bool matchSuboCarryOut(const MachineInstr &MI, BuildFnTy &MatchInfo) const;
 
+  // (sext_inreg (sext_inreg x, K0), K1)
+  void applyRedundantSextInReg(MachineInstr &Root, MachineInstr &Other) const;
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td 
b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 660b03080f92e..6a0ff683a4647 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1849,6 +1849,12 @@ def anyext_of_anyext : ext_of_ext_opcodes;
 def anyext_of_zext : ext_of_ext_opcodes;
 def anyext_of_sext : ext_of_ext_opcodes;
 
+def sext_inreg_of_sext_inreg : GICombineRule<
+   (defs root:$dst),
+   (match (G_SEXT_INREG $x, $src, $a):$other,
+  (G_SEXT_INREG $dst, $x, $b):$root),
+   (apply [{ Helper.applyRedundantSextInReg(*${root}, *${other}); }])>;
+
 // Push cast through build vector.
 class buildvector_of_opcode : GICombineRule <
   (defs root:$root, build_fn_matchinfo:$matchinfo),
@@ -1896,7 +1902,8 @@ def cast_of_cast_combines: GICombineGroup<[
   sext_of_anyext,
   anyext_of_anyext,
   anyext_of_zext,
-  anyext_of_sext
+  anyext_of_sext,
+  sext_inreg_of_sext_inreg,
 ]>;
 
 def cast_combines: GICombineGroup<[
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp 
b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
index 182484754d091..ffc2384fc14fd 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -372,3 +372,30 @@ bool CombinerHelper::matchCastOfInteger(const MachineInstr 
&CastMI,
 return false;
   }
 }
+
+void CombinerHelper::applyRedundantSextInReg(MachineInstr &Root,
+ MachineInstr &Other) const {
+  assert(Root.getOpcode() == TargetOpcode::G_SEXT_INREG &&
+ Other.getOpcode() == TargetOpcode::G_SEXT_INREG);
+
+  unsigned RootWidth = Root.getOperand(2).getImm();
+  unsigned OtherWidth = Other.getOperand(2).getImm();
+
+  Register Dst = Root.getOperand(0).getReg();
+  Register OtherDst = Other.getOperand(0).getReg();
+  Register Src = Other.getOperand(1).getReg();
+
+  if (RootWidth >= OtherWidth) {
+// The root sext_inreg is entirely redundant because the other one
+// is narrower.
+Observer.changingAllUsesOfReg(MRI, Dst);
+MRI.replaceRegWith(Dst, OtherDst);
+Observer.finishedChangingAllUsesOfReg();
+  } else {
+// RootWidth < OtherWidth, rewrite this G_SEXT_INREG with the source of the
+// other G_SEXT_INREG.
+Builder.buildSExtInReg(Dst, Src, RootWidth);
+  }
+
+  Root.eraseFromParent();
+}
diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-sext-inreg.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-sext-inreg.mir
new file mode 100644
index 0..566ee8e6c338d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-sext-inreg.mir
@@ -0,0 +1,164 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: inreg8_inreg16
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: inreg8_inreg16
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 8
+%inreg1:_(s32) = G_SEXT_INREG %inreg, 16
+$vgpr0 = COPY %inreg1
+...
+
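
Illustrative only: the two cases handled in `applyRedundantSextInReg` above
amount to the algebraic identity
`(sext_inreg (sext_inreg x, K0), K1) == (sext_inreg x, min(K0, K1))`.
A standalone C++ check of that identity (a hypothetical 32-bit model, not
the combiner helper itself):

```
#include <algorithm>
#include <cassert>
#include <cstdint>

// Model of G_SEXT_INREG on an i32: sign-extend the low 'width' bits.
static int32_t sextInReg(int32_t x, unsigned width) {
  unsigned shift = 32u - width;
  return static_cast<int32_t>(static_cast<uint32_t>(x) << shift) >> shift;
}

int main() {
  // K1 >= K0: the outer op is a no-op (Root is erased, uses rewired).
  // K1 <  K0: the pair collapses to a single sext_inreg of width K1.
  for (int32_t x : {0x7f, -0x80, 0x1234, -1})
    for (unsigned k0 : {8u, 16u, 24u})
      for (unsigned k1 : {8u, 16u, 24u})
        assert(sextInReg(sextInReg(x, k0), k1) ==
               sextInReg(x, std::min(k0, k1)));
  return 0;
}
```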

[llvm-branch-commits] [llvm] [AMDGPU] Add sext_trunc in RegBankCombiner (PR #131623)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh created 
https://github.com/llvm/llvm-project/pull/131623

None

>From 3f2cbbd6addf4844c7c861a6de55be59a8c96c35 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Mon, 17 Mar 2025 13:22:25 +0100
Subject: [PATCH] [AMDGPU] Add sext_trunc in RegBankCombiner

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td 
b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index a21505356274b..083ce48911689 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -181,5 +181,5 @@ def AMDGPURegBankCombiner : GICombiner<
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
-   cast_of_cast_combines]> {
+   cast_of_cast_combines, sext_trunc]> {
 }

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add sext_trunc in RegBankCombiner (PR #131623)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh ready_for_review 
https://github.com/llvm/llvm-project/pull/131623
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [GlobalISel] Combine redundant sext_inreg (PR #131624)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh ready_for_review 
https://github.com/llvm/llvm-project/pull/131624
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add sext_trunc in RegBankCombiner (PR #131623)

2025-03-17 Thread Pierre van Houtryve via llvm-branch-commits

Pierre-vh wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/131623).
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#131624** (https://app.graphite.dev/github/pr/llvm/llvm-project/131624)
* **#131623** 👈 (View in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/131623)
* **#131622** (https://app.graphite.dev/github/pr/llvm/llvm-project/131622)
* `main`

This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/


https://github.com/llvm/llvm-project/pull/131623
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [GlobalISel] Combine redundant sext_inreg (PR #131624)

2025-03-17 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)


Changes



---
Full diff: https://github.com/llvm/llvm-project/pull/131624.diff


6 Files Affected:

- (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+3) 
- (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+8-1) 
- (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp (+27) 
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-sext-inreg.mir 
(+164) 
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir 
(+87) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll (-5) 


``diff
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h 
b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9b78342c8fc39..5778377d125a8 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -994,6 +994,9 @@ class CombinerHelper {
   // overflow sub
   bool matchSuboCarryOut(const MachineInstr &MI, BuildFnTy &MatchInfo) const;
 
+  // (sext_inreg (sext_inreg x, K0), K1)
+  void applyRedundantSextInReg(MachineInstr &Root, MachineInstr &Other) const;
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td 
b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 660b03080f92e..6a0ff683a4647 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1849,6 +1849,12 @@ def anyext_of_anyext : ext_of_ext_opcodes;
 def anyext_of_zext : ext_of_ext_opcodes;
 def anyext_of_sext : ext_of_ext_opcodes;
 
+def sext_inreg_of_sext_inreg : GICombineRule<
+   (defs root:$dst),
+   (match (G_SEXT_INREG $x, $src, $a):$other,
+  (G_SEXT_INREG $dst, $x, $b):$root),
+   (apply [{ Helper.applyRedundantSextInReg(*${root}, *${other}); }])>;
+
 // Push cast through build vector.
 class buildvector_of_opcode : GICombineRule <
   (defs root:$root, build_fn_matchinfo:$matchinfo),
@@ -1896,7 +1902,8 @@ def cast_of_cast_combines: GICombineGroup<[
   sext_of_anyext,
   anyext_of_anyext,
   anyext_of_zext,
-  anyext_of_sext
+  anyext_of_sext,
+  sext_inreg_of_sext_inreg,
 ]>;
 
 def cast_combines: GICombineGroup<[
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp 
b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
index 182484754d091..ffc2384fc14fd 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -372,3 +372,30 @@ bool CombinerHelper::matchCastOfInteger(const MachineInstr 
&CastMI,
 return false;
   }
 }
+
+void CombinerHelper::applyRedundantSextInReg(MachineInstr &Root,
+ MachineInstr &Other) const {
+  assert(Root.getOpcode() == TargetOpcode::G_SEXT_INREG &&
+ Other.getOpcode() == TargetOpcode::G_SEXT_INREG);
+
+  unsigned RootWidth = Root.getOperand(2).getImm();
+  unsigned OtherWidth = Other.getOperand(2).getImm();
+
+  Register Dst = Root.getOperand(0).getReg();
+  Register OtherDst = Other.getOperand(0).getReg();
+  Register Src = Other.getOperand(1).getReg();
+
+  if (RootWidth >= OtherWidth) {
+// The root sext_inreg is entirely redundant because the other one
+// is narrower.
+Observer.changingAllUsesOfReg(MRI, Dst);
+MRI.replaceRegWith(Dst, OtherDst);
+Observer.finishedChangingAllUsesOfReg();
+  } else {
+// RootWidth < OtherWidth, rewrite this G_SEXT_INREG with the source of the
+// other G_SEXT_INREG.
+Builder.buildSExtInReg(Dst, Src, RootWidth);
+  }
+
+  Root.eraseFromParent();
+}
diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-sext-inreg.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-sext-inreg.mir
new file mode 100644
index 0..566ee8e6c338d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-sext-inreg.mir
@@ -0,0 +1,164 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: inreg8_inreg16
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: inreg8_inreg16
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+%copy:_(s32) = COPY $vgpr0
+%inreg:_(s32) = G_SEXT_INREG %copy, 8
+%inreg1:_(s32) = G_SEXT_INREG %inreg, 16
+$vgpr0 = COPY %inreg1
+...
+
+---
+name: inreg16_inreg16
+tracksRegLiveness: true
+body: |
+  bb.0:
+liveins: $vgpr0
+; CHECK-LABEL: name: inreg16_inreg16
+; CHECK: liveins: $vgpr0
+; CHECK-NEXT: {{  $}}
+
