[llvm-branch-commits] [llvm] [AMDGPU][NPM] Complete optimized regalloc pipeline (PR #138491)

2025-07-07 Thread Akshat Oke via llvm-branch-commits


@@ -2174,7 +2174,44 @@ void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
   addPass(SIShrinkInstructionsPass());
 }
 
+void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
+    AddMachinePass &addPass) const {
+  if (EnableDCEInRA)
+    insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass());
+
+  // FIXME: when an instruction has a Killed operand, and the instruction is
+  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
+  // the register in LiveVariables, this would trigger a failure in verifier,
+  // we should fix it and enable the verifier.
+  if (OptVGPRLiveRange)
+    insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(

optimisan wrote:

Since we are moving to the callback-style TargetPassBuilder design, I am 
keeping this the same as the legacy pipeline.
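
To make that concrete, here is a minimal self-contained model (toy string-based 
types, not LLVM's actual `CodeGenPassBuilder`) of the callback mechanism being 
kept: `insertPass` records a callback that fires each time a pass is added, 
appending the inserted pass right after the named target pass.

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Toy pipeline: passes are just names added in order.
struct MachineFunctionPassManager {
  std::vector<std::string> Passes;
  void addPass(const std::string &Name) { Passes.push_back(Name); }
};

struct PipelineBuilder {
  // Mirrors AfterCallbacks in CodeGenPassBuilder: one callback per insertion.
  std::vector<std::function<void(const std::string &,
                                 MachineFunctionPassManager &)>>
      AfterCallbacks;

  // "When TargetName is added, add InsertedName immediately after it."
  void insertPass(const std::string &TargetName,
                  const std::string &InsertedName) {
    AfterCallbacks.emplace_back(
        [TargetName, InsertedName](const std::string &Name,
                                   MachineFunctionPassManager &MFPM) {
          if (Name == TargetName)
            MFPM.addPass(InsertedName);
        });
  }

  // Adding a pass runs every registered callback, like the NPM builder does.
  void addPass(MachineFunctionPassManager &MFPM, const std::string &Name) {
    MFPM.addPass(Name);
    for (auto &CB : AfterCallbacks)
      CB(Name, MFPM);
  }
};

int main() {
  PipelineBuilder B;
  MachineFunctionPassManager MFPM;
  B.insertPass("phi-node-elimination", "si-lower-control-flow");
  B.addPass(MFPM, "detect-dead-lanes");
  B.addPass(MFPM, "phi-node-elimination");
  for (const auto &P : MFPM.Passes)
    std::cout << P << '\n'; // si-lower-control-flow prints after its target
}
```

The legacy `TargetPassConfig::insertPass` achieves the same effect with pass 
IDs, which is why keeping the NPM version structurally identical is cheap.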

https://github.com/llvm/llvm-project/pull/138491
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU][NPM] Complete optimized regalloc pipeline (PR #138491)

2025-07-07 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan updated 
https://github.com/llvm/llvm-project/pull/138491

From 3d1996a1d347eb14d6908d789307c0a3eef0568c Mon Sep 17 00:00:00 2001
From: Akshat Oke 
Date: Mon, 5 May 2025 06:30:03 +
Subject: [PATCH] [AMDGPU][NPM] Complete optimized regalloc pipeline

Also fill in some other passes.
---
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  2 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 41 +--
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h  |  1 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  |  7 +++-
 4 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h 
b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index a3b19af4adc39..29bc432ba3d5d 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -574,7 +574,7 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
   /// Insert InsertedPass pass after TargetPass pass.
   /// Only machine function passes are supported.
  template <typename TargetPassT, typename InsertedPassT>
-  void insertPass(InsertedPassT &&Pass) {
+  void insertPass(InsertedPassT &&Pass) const {
 AfterCallbacks.emplace_back(
 [&](StringRef Name, MachineFunctionPassManager &MFPM) mutable {
   if (Name == TargetPassT::name())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 09b40c9173ff6..3f325398752a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -2192,7 +2192,44 @@ void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
   addPass(SIShrinkInstructionsPass());
 }
 
+void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
+    AddMachinePass &addPass) const {
+  if (EnableDCEInRA)
+    insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass());
+
+  // FIXME: when an instruction has a Killed operand, and the instruction is
+  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
+  // the register in LiveVariables, this would trigger a failure in verifier,
+  // we should fix it and enable the verifier.
+  if (OptVGPRLiveRange)
+    insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(
+        SIOptimizeVGPRLiveRangePass());
+
+  // This must be run immediately after phi elimination and before
+  // TwoAddressInstructions, otherwise the processing of the tied operand of
+  // SI_ELSE will introduce a copy of the tied operand source after the else.
+  insertPass<PHIEliminationPass>(SILowerControlFlowPass());
+
+  if (EnableRewritePartialRegUses)
+    insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());
+
+  if (isPassEnabled(EnablePreRAOptimizations))
+    insertPass<MachineSchedulerPass>(GCNPreRAOptimizationsPass());
+
+  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
+  // instructions that cause scheduling barriers.
+  insertPass<MachineSchedulerPass>(SIWholeQuadModePass());
+
+  if (OptExecMaskPreRA)
+    insertPass<MachineSchedulerPass>(SIOptimizeExecMaskingPreRAPass());
+
+  // This is not an essential optimization and it has a noticeable impact on
+  // compilation time, so we only enable it from O2.
+  if (TM.getOptLevel() > CodeGenOptLevel::Less)
+    insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass());
+
+  Base::addOptimizedRegAlloc(addPass);
+}
 
 Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
 AddMachinePass &addPass) const {
@@ -2220,21 +2257,19 @@ Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
   addPass(SIPreAllocateWWMRegsPass());
 
   // For allocating other wwm register operands.
-  // addRegAlloc(addPass, RegAllocPhase::WWM);
   addPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}));
   addPass(SILowerWWMCopiesPass());
   addPass(VirtRegRewriterPass(false));
   addPass(AMDGPUReserveWWMRegsPass());
 
   // For allocating per-thread VGPRs.
-  // addRegAlloc(addPass, RegAllocPhase::VGPR);
   addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}));
 
 
   addPreRewrite(addPass);
   addPass(VirtRegRewriterPass(true));
 
-  // TODO: addPass(AMDGPUMarkLastScratchLoadPass());
+  addPass(AMDGPUMarkLastScratchLoadPass());
   return Error::success();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 589123274d0f5..3c62cd19c6e57 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -182,6 +182,7 @@ class AMDGPUCodeGenPassBuilder
   void addPostRegAlloc(AddMachinePass &) const;
   void addPreEmitPass(AddMachinePass &) const;
   Error addRegAssignmentOptimized(AddMachinePass &) const;
+  void addOptimizedRegAlloc(AddMachinePass &) const;
 
   /// Check if a pass is enabled given \p Opt option. The option always
   /// overrides defaults if explicitly used. Otherwise its default will be used
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll 
b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 5155ec212c12f..0fa4619be53df 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -8,8 +8,11 @@
 ; RUN:   | FileCheck -check-prefix=GC

[llvm-branch-commits] [llvm] [AMDGPU][NPM] Complete optimized regalloc pipeline (PR #138491)

2025-07-07 Thread Christudasan Devadasan via llvm-branch-commits

https://github.com/cdevadas approved this pull request.


https://github.com/llvm/llvm-project/pull/138491
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CodeGen][NPM] Port InitUndef to NPM (PR #138495)

2025-07-07 Thread Akshat Oke via llvm-branch-commits

https://github.com/optimisan updated 
https://github.com/llvm/llvm-project/pull/138495

From 7938c63eb0883f47cffd02219400780ede01e559 Mon Sep 17 00:00:00 2001
From: Akshat Oke 
Date: Mon, 5 May 2025 08:47:42 +
Subject: [PATCH 1/3] [CodeGen][NPM] Port InitUndef to NPM

---
 llvm/include/llvm/CodeGen/InitUndef.h | 24 +
 llvm/include/llvm/InitializePasses.h  |  2 +-
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  1 +
 .../llvm/Passes/MachinePassRegistry.def   |  2 +-
 llvm/lib/CodeGen/CodeGen.cpp  |  2 +-
 llvm/lib/CodeGen/InitUndef.cpp| 50 +--
 llvm/lib/Passes/PassBuilder.cpp   |  1 +
 llvm/test/CodeGen/AArch64/init-undef.mir  |  3 ++
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  |  6 +--
 .../rvv/handle-noreg-with-implicit-def.mir|  2 +
 .../rvv/subregister-undef-early-clobber.mir   |  1 +
 .../RISCV/rvv/undef-earlyclobber-chain.mir|  1 +
 12 files changed, 74 insertions(+), 21 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/InitUndef.h

diff --git a/llvm/include/llvm/CodeGen/InitUndef.h 
b/llvm/include/llvm/CodeGen/InitUndef.h
new file mode 100644
index 0..7274824a74905
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/InitUndef.h
@@ -0,0 +1,24 @@
+//===- llvm/CodeGen/InitUndef.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_INITUNDEF_H
+#define LLVM_CODEGEN_INITUNDEF_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class InitUndefPass : public PassInfoMixin<InitUndefPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_INITUNDEF_H
diff --git a/llvm/include/llvm/InitializePasses.h 
b/llvm/include/llvm/InitializePasses.h
index 1b5b1d524..20462288ef667 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -326,7 +326,7 @@ LLVM_ABI void initializeTargetTransformInfoWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &);
 LLVM_ABI void initializeTypeBasedAAWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeTypePromotionLegacyPass(PassRegistry &);
-LLVM_ABI void initializeInitUndefPass(PassRegistry &);
+LLVM_ABI void initializeInitUndefLegacyPass(PassRegistry &);
 LLVM_ABI void initializeUniformityInfoWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &);
 LLVM_ABI void initializeUnpackMachineBundlesPass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h 
b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 29bc432ba3d5d..a3f439119b7da 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -43,6 +43,7 @@
 #include "llvm/CodeGen/GlobalMerge.h"
 #include "llvm/CodeGen/GlobalMergeFunctions.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
+#include "llvm/CodeGen/InitUndef.h"
 #include "llvm/CodeGen/InterleavedAccess.h"
 #include "llvm/CodeGen/InterleavedLoadCombine.h"
 #include "llvm/CodeGen/JMCInstrumenter.h"
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def 
b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 518dc55acb99b..e03038921af99 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -149,6 +149,7 @@ MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass())
 MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass())
 MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass())
 MACHINE_FUNCTION_PASS("fixup-statepoint-caller-saved", 
FixupStatepointCallerSavedPass())
+MACHINE_FUNCTION_PASS("init-undef", InitUndefPass())
 MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass())
 MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass())
 MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass())
@@ -306,7 +307,6 @@ DUMMY_MACHINE_FUNCTION_PASS("fs-profile-loader", MIRProfileLoaderNewPass)
 DUMMY_MACHINE_FUNCTION_PASS("funclet-layout", FuncletLayoutPass)
 DUMMY_MACHINE_FUNCTION_PASS("gc-empty-basic-blocks", GCEmptyBasicBlocksPass)
 DUMMY_MACHINE_FUNCTION_PASS("implicit-null-checks", ImplicitNullChecksPass)
-DUMMY_MACHINE_FUNCTION_PASS("init-undef-pass", InitUndefPass)
 DUMMY_MACHINE_FUNCTION_PASS("instruction-select", InstructionSelectPass)
 DUMMY_MACHINE_FUNCTION_PASS("irtranslator", IRTranslatorPass)
 DUMMY_MACHINE_FUNCTION_PASS("kcfi", MachineKCFIPass)
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 5250534d8a4e4..aa3591cb6be58 100644
--- a/llvm/lib/CodeGen/CodeG

[llvm-branch-commits] [libc] [libc] Modular printf option (float only) (PR #147426)

2025-07-07 Thread Petr Hosek via llvm-branch-commits


@@ -0,0 +1,41 @@
+#ifdef LIBC_COPT_PRINTF_MODULAR

petrhosek wrote:

This file needs the copyright header.
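
For reference, the standard LLVM license header has this shape (the file name 
and description line here are placeholders; the exact path depends on the file 
in question):

```cpp
//===-- printf_modular_example.h - One-line description --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
```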

https://github.com/llvm/llvm-project/pull/147426
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/20.x: [AArch64][SME] Fix restoring callee-saves from FP with hazard padding (PR #144693)

2025-07-07 Thread Benjamin Maxwell via llvm-branch-commits

MacDue wrote:

I think it's reasonably safe given that the general case (without hazard 
padding) is well used and tested, and there have been no issues reported since 
this landed a few weeks back.

https://github.com/llvm/llvm-project/pull/144693
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/20.x: [LoongArch] Pass OptLevel to LoongArchDAGToDAGISel correctly (PR #144459)

2025-07-07 Thread via llvm-branch-commits

leecheechen wrote:

Fixed a crash caused by incorrectly passing OptLevel to LoongArchDAGToDAGISel.

https://github.com/llvm/llvm-project/pull/144459
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-globalisel

Author: Sameer Sahasrabuddhe (ssahasra)


Changes

The memory legalizer is currently responsible for emitting wait instructions at 
ordering operations such as acquire and release. It tries to be efficient by 
emitting waits only when required. In particular, it does not emit a wait on 
vmcnt at workgroup scope since that ordering is already guaranteed by the 
architecture. But this is now incorrect because direct loads to LDS have an LDS 
component which needs explicit ordering on vmcnt. But it is inefficient to 
always emit a wait on vmcnt since the majority of programs do not use direct 
loads to LDS, and this will affect all workgroup scope operations.

As a first step to that, the memory legalizer now emits a soft wait instruction 
even if all counts are trivially ~0. This is a placeholder that the 
SIInsertWaitcnts pass will either optimize away or strengthen based on its 
analysis of whether direct loads to LDS are pending at this point in the 
program.
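
As a rough self-contained model of the change (toy types, not the actual 
SIMemoryLegalizer API): the legalizer used to skip emitting a wait when every 
counter was trivially ~0, and now it always emits a soft wait as a placeholder 
for SIInsertWaitcnts to resolve.

```cpp
#include <cstdint>
#include <vector>

constexpr uint32_t NoWait = ~0u; // sentinel: "no wait required"

struct Waitcnt {
  uint32_t VmCnt = NoWait;
  uint32_t LgkmCnt = NoWait;
  bool isTrivial() const { return VmCnt == NoWait && LgkmCnt == NoWait; }
};

struct Inst {
  bool IsSoftWait = false;
  Waitcnt Wait;
};

// Old behavior: emit nothing when no counter needs waiting. New behavior:
// always emit the soft wait; SIInsertWaitcnts later drops or strengthens it.
void emitReleaseWait(std::vector<Inst> &Block, const Waitcnt &Wait,
                     bool AlwaysEmitSoft) {
  if (!AlwaysEmitSoft && Wait.isTrivial())
    return; // old: workgroup-scope release emitted no vmcnt wait at all
  Block.push_back({/*IsSoftWait=*/true, Wait}); // new: trivially-~0 placeholder
}
```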

---

Patch is 4.42 MiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/147257.diff


41 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+25-33)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll (+112)
- (modified) llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+66-66)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll (+64)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll (+168-6)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll (+220-4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll (+160-32)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll (+1420)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (+160-32)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+14-2)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll (+1410)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (+576-68)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll (+192)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll (+1152-52)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll (+168)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (+14-1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll (+1152-52)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll (+706-82)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll (+940-98)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll (+1548)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll (+940-98)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (+27-7)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll (+1548)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll (+940-98)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir (+31)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir (+31)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+5)
- (modified) llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir (+1)


```diff
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp 
b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3212060f303a5..f015d3ad7811e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1074,8 +1074,6 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
 bool IsCrossAddrSpaceOrdering, Position Pos,
 A

[llvm-branch-commits] [llvm] [LV] Use VPReductionRecipe for partial reductions (PR #146073)

2025-07-07 Thread Sam Tebbs via llvm-branch-commits


@@ -2744,6 +2702,12 @@ class VPSingleDefBundleRecipe : public VPSingleDefRecipe {
 /// vector operands, performing a reduction.add on the result, and adding
 /// the scalar result to a chain.
 MulAccumulateReduction,
+/// Represent an inloop multiply-accumulate reduction, multiplying the
+/// extended vector operands, negating the multiplication, performing a
+/// reduction.add on the result, and adding the scalar result to a chain.
+ExtNegatedMulAccumulateReduction,

SamTebbs33 wrote:

Thanks Florian, that sounds like a good approach.

https://github.com/llvm/llvm-project/pull/146073
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Sameer Sahasrabuddhe (ssahasra)


Changes

The memory legalizer is currently responsible for emitting wait instructions at 
ordering operations such as acquire and release. It tries to be efficient by 
emitting waits only when required. In particular, it does not emit a wait on 
vmcnt at workgroup scope since that ordering is already guaranteed by the 
architecture. But this is now incorrect because direct loads to LDS have an LDS 
component which needs explicit ordering on vmcnt. But it is inefficient to 
always emit a wait on vmcnt since the majority of programs do not use direct 
loads to LDS, and this will affect all workgroup scope operations.

As a first step to that, the memory legalizer now emits a soft wait instruction 
even if all counts are trivially ~0. This is a placeholder that the 
SIInsertWaitcnts pass will either optimize away or strengthen based on its 
analysis of whether direct loads to LDS are pending at this point in the 
program.

---

Patch is 4.42 MiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/147257.diff


41 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+25-33)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll (+112)
- (modified) llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+66-66)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll (+64)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll (+168-6)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll (+220-4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll (+160-32)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll (+1420)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (+160-32)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+14-2)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll (+1410)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (+576-68)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll (+192)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll (+1152-52)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll (+168)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (+14-1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll (+1152-52)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll (+706-82)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll (+940-98)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll (+1548)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll (+940-98)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (+27-7)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll (+1548)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll (+940-98)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir (+31)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir (+31)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+5)
- (modified) llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir (+1)


```diff
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp 
b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3212060f303a5..f015d3ad7811e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1074,8 +1074,6 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
 bool IsCrossAddrSpaceOrdering, Position Pos,
 At

[llvm-branch-commits] [llvm] [AMDGPU] efficiently wait for direct loads to LDS at all scopes (PR #147258)

2025-07-07 Thread Sameer Sahasrabuddhe via llvm-branch-commits

https://github.com/ssahasra created 
https://github.com/llvm/llvm-project/pull/147258

Currently, the memory legalizer does not generate any wait on vmcnt at workgroup
scope. This is incorrect because direct loads to LDS are tracked using vmcnt and
they need to be released properly at workgroup scope.

The memory legalizer was previously updated to always emit a soft wait
instruction even when all counts are trivially ~0. SIInsertWaitcnts now examines
pending loads to LDS at each S_WAITCNT_soft instruction. If such instructions
exist, the vmcnt (which could be ~0) is upgraded to a value that waits for any
such pending loads to LDS. After that, any soft instruction that has only
trivial ~0 counts is automatically dropped.

Thus, common programs that do not use direct loads to LDS remain unaffected, but
programs that do use such loads see a correct and efficient vmcnt even at
workgroup scope.
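
A matching self-contained sketch (again toy types, not the real 
SIInsertWaitcnts code) of the resolve step described above: pending LDS DMA 
loads strengthen the placeholder's vmcnt, and a placeholder whose counters all 
remain ~0 is dropped.

```cpp
#include <cstdint>
#include <optional>

constexpr uint32_t NoWait = ~0u;

struct SoftWait {
  uint32_t VmCnt = NoWait; // ~0 encodes "no wait yet"
};

// Resolve one S_WAITCNT_soft placeholder: strengthen it if LDS DMA loads are
// pending, otherwise drop it when it still waits for nothing.
std::optional<SoftWait> resolveSoftWait(SoftWait W,
                                        unsigned PendingLdsDmaLoads) {
  if (PendingLdsDmaLoads > 0)
    W.VmCnt = 0;         // simplified: wait for all outstanding vm loads
  if (W.VmCnt == NoWait)
    return std::nullopt; // still trivial: erase the placeholder
  return W;              // keep the now-meaningful wait
}
```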

From de111cd96570df7127722cb7df476cb833694f72 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe 
Date: Tue, 17 Jun 2025 13:11:55 +0530
Subject: [PATCH 1/2] [AMDGCN] pre-checkin test for LDS DMA and release
 operations

---
 .../AMDGPU/lds-dma-workgroup-release.ll   | 482 ++
 1 file changed, 482 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll

diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll 
b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
new file mode 100644
index 0..1db15c3c6099c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -0,0 +1,482 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=GFX1010
+
+; In each of these tests, an LDS DMA operation is followed by a release pattern
+; at workgroup scope. The fence in such a release (implicit or explicit) should
+; wait for the store component in the LDS DMA. The additional noalias metadata
+; is just meant to ensure that the wait counts are not generated due to some
+; unintended aliasing.
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
+; GFX900-LABEL: barrier_release:
+; GFX900:   ; %bb.0: ; %main_body
+; GFX900-NEXT:s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX900-NEXT:v_mov_b32_e32 v0, 0x800
+; GFX900-NEXT:v_mov_b32_e32 v1, 0
+; GFX900-NEXT:s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:s_mov_b32 m0, s12
+; GFX900-NEXT:s_nop 0
+; GFX900-NEXT:buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX900-NEXT:v_mov_b32_e32 v0, s13
+; GFX900-NEXT:s_waitcnt vmcnt(0)
+; GFX900-NEXT:s_barrier
+; GFX900-NEXT:ds_read_b32 v0, v0
+; GFX900-NEXT:s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:global_store_dword v1, v0, s[14:15]
+; GFX900-NEXT:s_endpgm
+;
+; GFX90A-LABEL: barrier_release:
+; GFX90A:   ; %bb.1:
+; GFX90A-NEXT:s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT:s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT:s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:s_branch .LBB0_0
+; GFX90A-NEXT:.p2align 8
+; GFX90A-NEXT:  ; %bb.2:
+; GFX90A-NEXT:  .LBB0_0: ; %main_body
+; GFX90A-NEXT:s_mov_b32 m0, s12
+; GFX90A-NEXT:v_mov_b32_e32 v0, 0x800
+; GFX90A-NEXT:buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX90A-NEXT:v_mov_b32_e32 v0, s13
+; GFX90A-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX90A-NEXT:s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:s_barrier
+; GFX90A-NEXT:s_waitcnt vmcnt(0)
+; GFX90A-NEXT:ds_read_b32 v0, v0
+; GFX90A-NEXT:v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT:s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: barrier_release:
+; GFX90A-TGSPLIT:   ; %bb.1:
+; GFX90A-TGSPLIT-NEXT:s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT:s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:s_branch .LBB0_0
+; GFX90A-TGSPLIT-NEXT:.p2align 8
+; GFX90A-TGSPLIT-NEXT:  ; %bb.2:
+; GFX90A-TGSPLIT-NEXT:  .LBB0_0: ; %main_body
+; GFX90A-TGSPLIT-NEXT:s_mov_b32 m0, s12
+; GFX90A-TGSPLIT-NEXT:v_mov_b32_e32 v0, 0x800
+; GFX90A-TGSPLIT-NEXT:buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX90A-TGSPLIT-NEXT:v_mov_b32_e32 v0, s13
+; G

[llvm-branch-commits] [llvm] [AMDGPU] efficiently wait for direct loads to LDS at all scopes (PR #147258)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Sameer Sahasrabuddhe (ssahasra)


Changes

Currently, the memory legalizer does not generate any wait on vmcnt at workgroup
scope. This is incorrect because direct loads to LDS are tracked using vmcnt and
they need to be released properly at workgroup scope.

The memory legalizer was previously updated to always emit a soft wait
instruction even when all counts are trivially ~0. SIInsertWaitcnts now examines
pending loads to LDS at each S_WAITCNT_soft instruction. If such instructions
exist, the vmcnt (which could be ~0) is upgraded to a value that waits for any
such pending loads to LDS. After that, any soft instruction that has only
trivial ~0 counts is automatically dropped.

Thus, common programs that do not use direct loads to LDS remain unaffected, but
programs that do use such loads see a correct and efficient vmcnt even at
workgroup scope.

---

Patch is 22.89 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/147258.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+13) 
- (added) llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll (+482) 


```diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7ce1359f03da6..b57cfe5d6f2c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1374,6 +1374,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
 ScoreBrackets.simplifyWaitcnt(OldWait);
   Wait = Wait.combined(OldWait);
 
+  if (!WaitcntInstr && II.getOpcode() == AMDGPU::S_WAITCNT_soft) {
+// Each direct load to LDS is also a store to LDS, but we do not have a
+// separate counter for it. Instead these operations increment LOAD_CNT
+// and need to be waited for at a release fence. So we treat a release
+// fence as if it depends on any previous LDS DMA stores.
+//
+// Note that a user-specified S_WAITCNT instruction is not affected; we
+// only check for S_WAITCNT_soft since that represents a fence.
+//
+// FIXME: How does one detect that a soft wait is a release???
+ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+  }
+
   // Merge consecutive waitcnt of the same type by erasing multiples.
   if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
 II.eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll 
b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
new file mode 100644
index 0..882c43b41bac8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -0,0 +1,482 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=GFX1010
+
+; In each of these tests, an LDS DMA operation is followed by a release pattern
+; at workgroup scope. The fence in such a release (implicit or explicit) should
+; wait for the store component in the LDS DMA. The additional noalias metadata
+; is just meant to ensure that the wait counts are not generated due to some
+; unintended aliasing.
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
+; GFX900-LABEL: barrier_release:
+; GFX900:   ; %bb.0: ; %main_body
+; GFX900-NEXT:s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX900-NEXT:v_mov_b32_e32 v0, 0x800
+; GFX900-NEXT:v_mov_b32_e32 v1, 0
+; GFX900-NEXT:s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:s_mov_b32 m0, s12
+; GFX900-NEXT:s_nop 0
+; GFX900-NEXT:buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX900-NEXT:v_mov_b32_e32 v0, s13
+; GFX900-NEXT:s_waitcnt vmcnt(0)
+; GFX900-NEXT:s_barrier
+; GFX900-NEXT:ds_read_b32 v0, v0
+; GFX900-NEXT:s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:global_store_dword v1, v0, s[14:15]
+; GFX900-NEXT:s_endpgm
+;
+; GFX90A-LABEL: barrier_release:
+; GFX90A:   ; %bb.1:
+; GFX90A-NEXT:s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT:s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT:s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:s_branch .LBB0_0
+; GFX90A-NEXT:.p2align 8
+; GFX90A-NEXT:  ; %bb.2:

[llvm-branch-commits] [llvm] [SelectionDAG] Deal with POISON for INSERT_VECTOR_ELT/INSERT_SUBVECTOR (part 3) (PR #143105)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits


@@ -953,8 +953,17 @@ class SelectionDAG {
   }
 
   /// Insert \p SubVec at the \p Idx element of \p Vec.
+  /// If \p SkipUndef is true and \p SubVec is UNDEF/POISON, then \p Vec is
+  /// returned.

arsenm wrote:

This is a strange pattern that I feel should be avoided. I don't understand 
why this would need semantic treatment. These get* functions should just 
return the node with the requested operands.

https://github.com/llvm/llvm-project/pull/143105
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] ARM: Remove subtarget field tracking SjLj (PR #147226)

2025-07-07 Thread Daniel Kiss via llvm-branch-commits

https://github.com/DanielKristofKiss approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/147226
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Add canonical loop LLVM-IR lowering (PR #147069)

2025-07-07 Thread Michael Kruse via llvm-branch-commits

https://github.com/Meinersbur edited 
https://github.com/llvm/llvm-project/pull/147069
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Sameer Sahasrabuddhe via llvm-branch-commits


@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1(
 ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f

ssahasra wrote:

Not directly related to this discussion, but this line does exist:
```
  // Merge consecutive waitcnt of the same type by erasing multiples.
  if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
```
It is meant to preserve S_WAITCNT_soft even if there is no actual wait 
required. @jayfoad , you had introduced `TrySimplify` ... do you think it is 
okay to relax its uses?

```
  if (TrySimplify **|| (Opcode != II.getOpcode() && OldWait.hasValuesSetToMax())**)
    ScoreBrackets.simplifyWaitcnt(OldWait);
```
Here, `hasValuesSetToMax()` is a hypothetical function that checks the encoding 
of each count separately to have all bits set to 1, and not just a ~0 in the 
data structure.
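
One possible shape for that hypothetical check, as a self-contained sketch 
(toy fields and masks; the real `AMDGPU::Waitcnt` has more counters and 
per-target encoding widths):

```cpp
#include <cstdint>

struct Waitcnt {
  uint32_t LoadCnt = ~0u;
  uint32_t StoreCnt = ~0u;
  uint32_t DsCnt = ~0u;
};

// True only when every counter, restricted to its encodable bit width, is
// all-ones, i.e. the *encoded* instruction waits for nothing; a plain ~0 in
// the in-memory struct is not enough if the field encodes more narrowly.
bool hasValuesSetToMax(const Waitcnt &W, uint32_t LoadMask, uint32_t StoreMask,
                       uint32_t DsMask) {
  return (W.LoadCnt & LoadMask) == LoadMask &&
         (W.StoreCnt & StoreMask) == StoreMask &&
         (W.DsCnt & DsMask) == DsMask;
}
```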

https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Sameer Sahasrabuddhe via llvm-branch-commits

https://github.com/ssahasra edited 
https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Add canonical loop LLVM-IR lowering (PR #147069)

2025-07-07 Thread Michael Kruse via llvm-branch-commits

https://github.com/Meinersbur updated 
https://github.com/llvm/llvm-project/pull/147069

From da2613d525deb4edcf0fac41e865ca0510c75210 Mon Sep 17 00:00:00 2001
From: Michael Kruse 
Date: Fri, 4 Jul 2025 16:26:20 +0200
Subject: [PATCH] omp.canonical_loop and omp.unroll_heuristic lowering

---
 .../mlir/Target/LLVMIR/ModuleTranslation.h|  43 +
 .../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp  |  10 +
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp  |  78 
 .../LLVMIR/openmp-cli-canonical_loop.mlir | 175 ++
 .../LLVMIR/openmp-cli-unroll-heuristic01.mlir |  56 ++
 .../LLVMIR/openmp-cli-unroll-heuristic02.mlir |  93 ++
 6 files changed, 455 insertions(+)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-canonical_loop.mlir
 create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic01.mlir
 create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic02.mlir

diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h 
b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
index 79e8bb6add0da..5d52cf3f04b6a 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -15,6 +15,7 @@
 #define MLIR_TARGET_LLVMIR_MODULETRANSLATION_H
 
 #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/IR/Value.h"
@@ -24,6 +25,7 @@
 #include "mlir/Target/LLVMIR/TypeToLLVM.h"
 
 #include "llvm/ADT/SetVector.h"
+#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/FPEnv.h"
 
 namespace llvm {
@@ -108,6 +110,41 @@ class ModuleTranslation {
 return blockMapping.lookup(block);
   }
 
+  /// Find the LLVM-IR loop that represents an MLIR loop.
+  llvm::CanonicalLoopInfo *lookupOMPLoop(omp::NewCliOp mlir) const {
+llvm::CanonicalLoopInfo *result = loopMapping.lookup(mlir);
+assert(result && "attempt to get non-existing loop");
+return result;
+  }
+
+  /// Find the LLVM-IR loop that represents an MLIR loop.
+  llvm::CanonicalLoopInfo *lookupOMPLoop(Value mlir) const {
+    return lookupOMPLoop(mlir.getDefiningOp<omp::NewCliOp>());
+  }
+
+  /// Mark an OpenMP loop as having been consumed.
+  void invalidateOmpLoop(omp::NewCliOp mlir) { loopMapping.erase(mlir); }
+
+  /// Mark an OpenMP loop as having been consumed.
+  void invalidateOmpLoop(Value mlir) {
+    invalidateOmpLoop(mlir.getDefiningOp<omp::NewCliOp>());
+  }
+
+  /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR
+  /// OpenMPIRBuilder CanonicalLoopInfo
+  void mapOmpLoop(omp::NewCliOp mlir, llvm::CanonicalLoopInfo *llvm) {
+    assert(llvm && "argument must be non-null");
+    llvm::CanonicalLoopInfo *&cur = loopMapping[mlir];
+    assert(cur == nullptr && "attempting to map a loop that is already mapped");
+    cur = llvm;
+  }
+
+  /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR
+  /// OpenMPIRBuilder CanonicalLoopInfo
+  void mapOmpLoop(Value mlir, llvm::CanonicalLoopInfo *llvm) {
+    mapOmpLoop(mlir.getDefiningOp<omp::NewCliOp>(), llvm);
+  }
+
   /// Stores the mapping between an MLIR operation with successors and a
   /// corresponding LLVM IR instruction.
   void mapBranch(Operation *mlir, llvm::Instruction *llvm) {
@@ -381,6 +418,12 @@ class ModuleTranslation {
   DenseMap<Value, llvm::Value *> valueMapping;
   DenseMap<Block *, llvm::BasicBlock *> blockMapping;
 
+  /// List of not yet consumed MLIR loop handles (represented by an omp.new_cli
+  /// operation which creates a value of type CanonicalLoopInfoType) and their
+  /// LLVM-IR representation as CanonicalLoopInfo which is managed by the
+  /// OpenMPIRBuilder.
+  DenseMap<omp::NewCliOp, llvm::CanonicalLoopInfo *> loopMapping;
+
   /// A mapping between MLIR LLVM dialect terminators and LLVM IR terminators
   /// they are converted to. This allows for connecting PHI nodes to the source
   /// values after all operations are converted.
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp 
b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index 7a0a7f86bc1e9..e77c4a0b94de9 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -42,6 +42,16 @@ template <typename T>
 struct OpenMPOpConversion : public ConvertOpToLLVMPattern<T> {
   using ConvertOpToLLVMPattern<T>::ConvertOpToLLVMPattern;
 
+  OpenMPOpConversion(LLVMTypeConverter &typeConverter,
+ PatternBenefit benefit = 1)
+  : ConvertOpToLLVMPattern(typeConverter, benefit) {
+// Operations using CanonicalLoopInfoType are lowered only by
+// mlir::translateModuleToLLVMIR() using the OpenMPIRBuilder. Until then,
+// the type and operations using it must be preserved.
+typeConverter.addConversion(
+[&](::mlir::omp::CanonicalLoopInfoType type) { return type; });
+  }
+
   LogicalResult
   matchAndRewrite(T op, typename T::Adaptor adaptor,
   ConversionPatternRewriter &rewriter) const override {
diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP

[llvm-branch-commits] [clang] [LifetimeSafety] Propagate loans using dataflow analysis (PR #147295)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 updated 
https://github.com/llvm/llvm-project/pull/147295

From dd2dd838f44ebeb6d45f75af2934159ee61b385b Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena 
Date: Sun, 6 Jul 2025 19:12:55 +
Subject: [PATCH] [LifetimeSafety] Propagate loans using dataflow analysis

---
 clang/lib/Analysis/LifetimeSafety.cpp | 255 +-
 .../Sema/warn-lifetime-safety-dataflow.cpp| 186 +
 2 files changed, 440 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Analysis/LifetimeSafety.cpp 
b/clang/lib/Analysis/LifetimeSafety.cpp
index 2c2309de90e26..cdbab31ac7a9c 100644
--- a/clang/lib/Analysis/LifetimeSafety.cpp
+++ b/clang/lib/Analysis/LifetimeSafety.cpp
@@ -482,7 +482,247 @@ class FactGenerator : public ConstStmtVisitor<FactGenerator> {
 };
 
 // ========================================================================= //
-//  TODO: Run dataflow analysis to propagate loans, analyse and error reporting.
+//  The Dataflow Lattice
+// ========================================================================= //
+
+// Using LLVM's immutable collections is efficient for dataflow analysis
+// as it avoids deep copies during state transitions.
+// TODO(opt): Consider using a bitset to represent the set of loans.
+using LoanSet = llvm::ImmutableSet<LoanID>;
+using OriginLoanMap = llvm::ImmutableMap<OriginID, LoanSet>;
+
+/// An object to hold the factories for immutable collections, ensuring
+/// that all created states share the same underlying memory management.
+struct LifetimeFactory {
+  OriginLoanMap::Factory OriginMapFact;
+  LoanSet::Factory LoanSetFact;
+
+  LoanSet createLoanSet(LoanID LID) {
+return LoanSetFact.add(LoanSetFact.getEmptySet(), LID);
+  }
+};
+
+/// LifetimeLattice represents the state of our analysis at a given program
+/// point. It is an immutable object, and all operations produce a new
+/// instance rather than modifying the existing one.
+struct LifetimeLattice {
+  /// The map from an origin to the set of loans it contains.
+  /// TODO(opt): To reduce the lattice size, propagate origins of declarations,
+  /// not expressions, because expressions are not visible across blocks.
+  OriginLoanMap Origins = OriginLoanMap(nullptr);
+
+  explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {}
+  LifetimeLattice() = default;
+
+  bool operator==(const LifetimeLattice &Other) const {
+return Origins == Other.Origins;
+  }
+  bool operator!=(const LifetimeLattice &Other) const {
+return !(*this == Other);
+  }
+
+  LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const {
+if (auto *Loans = Origins.lookup(OID))
+  return *Loans;
+return Factory.LoanSetFact.getEmptySet();
+  }
+
+  /// Computes the union of two lattices by performing a key-wise join of
+  /// their OriginLoanMaps.
+  // TODO(opt): This key-wise join is a performance bottleneck. A more
+  // efficient merge could be implemented using a Patricia Trie or HAMT
+  // instead of the current AVL-tree-based ImmutableMap.
+  LifetimeLattice join(const LifetimeLattice &Other,
+   LifetimeFactory &Factory) const {
+/// Merge the smaller map into the larger one ensuring we iterate over the
+/// smaller map.
+if (Origins.getHeight() < Other.Origins.getHeight())
+  return Other.join(*this, Factory);
+
+OriginLoanMap JoinedState = Origins;
+// For each origin in the other map, union its loan set with ours.
+for (const auto &Entry : Other.Origins) {
+  OriginID OID = Entry.first;
+  LoanSet OtherLoanSet = Entry.second;
+  JoinedState = Factory.OriginMapFact.add(
+  JoinedState, OID,
+  join(getLoans(OID, Factory), OtherLoanSet, Factory));
+}
+return LifetimeLattice(JoinedState);
+  }
+
+  LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const {
+/// Merge the smaller set into the larger one ensuring we iterate over the
+/// smaller set.
+if (a.getHeight() < b.getHeight())
+  std::swap(a, b);
+LoanSet Result = a;
+for (LoanID LID : b) {
+  /// TODO(opt): Profiling shows that this loop is a major performance
+  /// bottleneck. Investigate using a BitVector to represent the set of
+  /// loans for improved join performance.
+  Result = Factory.LoanSetFact.add(Result, LID);
+}
+return Result;
+  }
+
+  void dump(llvm::raw_ostream &OS) const {
+OS << "LifetimeLattice State:\n";
+if (Origins.isEmpty())
+  OS << "  \n";
+for (const auto &Entry : Origins) {
+  if (Entry.second.isEmpty())
+OS << "  Origin " << Entry.first << " contains no loans\n";
+  for (const LoanID &LID : Entry.second)
+OS << "  Origin " << Entry.first << " contains Loan " << LID << "\n";
+}
+  }
+};
+
+// ========================================================================= //
+//  The Transfer Function
+// ========================================================================= //
+class T
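
(The quoted patch is truncated here.) The key-wise join above can be modeled 
in a few lines with plain standard containers in place of LLVM's immutable 
ones: for every origin in either map, the joined loan set is the union of 
both sides.

```cpp
#include <map>
#include <set>

using LoanSet = std::set<int>;
using OriginLoanMap = std::map<int, LoanSet>;

// Key-wise join: for every origin in either map, the joined loan set is the
// union of both sides' sets; a missing key contributes the empty set.
OriginLoanMap join(const OriginLoanMap &A, const OriginLoanMap &B) {
  OriginLoanMap Result = A;
  for (const auto &[Origin, Loans] : B)
    Result[Origin].insert(Loans.begin(), Loans.end());
  return Result;
}
```

The real implementation keeps `llvm::ImmutableMap`/`ImmutableSet` precisely 
to avoid the deep copies this toy version performs on every join.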

[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147295
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [MLIR][OpenMP] Add canonical loop LLVM-IR lowering (PR #147069)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-mlir-llvm

@llvm/pr-subscribers-mlir

Author: Michael Kruse (Meinersbur)


Changes

Support for translating the operations introduced in #144785 to LLVM-IR.

In order to keep the lowering simple, `OpenMPIRBuilder::unrollLoopHeuristic` is 
applied when encountering the `omp.unroll_heuristic` op. As a result, the 
operation that unrolling is applied to (`omp.canonical_loop`) must have been 
emitted before, even though logically there is no such requirement.

Eventually, all transformations on a loop must be applied directly after 
emitting `omp.canonical_loop`, i.e. future transformations must be looked up 
when encountering `omp.canonical_loop` itself. This is because many 
OpenMPIRBuilder methods (e.g. `createParallel`) expect all the region code to 
be emitted within a callback. In the case of `createParallel`, the region code 
is outlined into a new function. Therefore, making the operation order an IR 
requirement would not make the implementation any easier.
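
A minimal model of the map-consume lifecycle implemented in the diff below 
(plain `std::map` and `int` handles standing in for `DenseMap`, 
`omp::NewCliOp`, and `llvm::CanonicalLoopInfo *`): a handle is mapped once 
when `omp.canonical_loop` is lowered, looked up by the transformation that 
consumes the loop, then invalidated so it cannot be reused.

```cpp
#include <cassert>
#include <map>

using LoopHandle = int;        // stands in for omp::NewCliOp
using CanonicalLoopInfo = int; // stands in for llvm::CanonicalLoopInfo *

std::map<LoopHandle, CanonicalLoopInfo> LoopMapping;

// Map once, when omp.canonical_loop is lowered.
void mapOmpLoop(LoopHandle H, CanonicalLoopInfo CLI) {
  assert(!LoopMapping.count(H) && "attempting to map an already mapped loop");
  LoopMapping[H] = CLI;
}

// Look up when a transformation (e.g. unroll) consumes the loop.
CanonicalLoopInfo lookupOMPLoop(LoopHandle H) {
  auto It = LoopMapping.find(H);
  assert(It != LoopMapping.end() && "attempt to get non-existing loop");
  return It->second;
}

// Invalidate afterwards so the consumed handle cannot be used again.
void invalidateOmpLoop(LoopHandle H) { LoopMapping.erase(H); }
```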

---

Patch is 21.16 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/147069.diff


6 Files Affected:

- (modified) mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h (+43) 
- (modified) mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp (+10) 
- (modified) 
mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+78) 
- (added) mlir/test/Target/LLVMIR/openmp-cli-canonical_loop.mlir (+175) 
- (added) mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic01.mlir (+56) 
- (added) mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic02.mlir (+93) 


```diff
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h 
b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
index 79e8bb6add0da..5d52cf3f04b6a 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -15,6 +15,7 @@
 #define MLIR_TARGET_LLVMIR_MODULETRANSLATION_H
 
 #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/IR/Value.h"
@@ -24,6 +25,7 @@
 #include "mlir/Target/LLVMIR/TypeToLLVM.h"
 
 #include "llvm/ADT/SetVector.h"
+#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/FPEnv.h"
 
 namespace llvm {
@@ -108,6 +110,41 @@ class ModuleTranslation {
 return blockMapping.lookup(block);
   }
 
+  /// Find the LLVM-IR loop that represents an MLIR loop.
+  llvm::CanonicalLoopInfo *lookupOMPLoop(omp::NewCliOp mlir) const {
+llvm::CanonicalLoopInfo *result = loopMapping.lookup(mlir);
+assert(result && "attempt to get non-existing loop");
+return result;
+  }
+
+  /// Find the LLVM-IR loop that represents an MLIR loop.
+  llvm::CanonicalLoopInfo *lookupOMPLoop(Value mlir) const {
+    return lookupOMPLoop(mlir.getDefiningOp<omp::NewCliOp>());
+  }
+
+  /// Mark an OpenMP loop as having been consumed.
+  void invalidateOmpLoop(omp::NewCliOp mlir) { loopMapping.erase(mlir); }
+
+  /// Mark an OpenMP loop as having been consumed.
+  void invalidateOmpLoop(Value mlir) {
+    invalidateOmpLoop(mlir.getDefiningOp<omp::NewCliOp>());
+  }
+
+  /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR
+  /// OpenMPIRBuilder CanonicalLoopInfo
+  void mapOmpLoop(omp::NewCliOp mlir, llvm::CanonicalLoopInfo *llvm) {
+    assert(llvm && "argument must be non-null");
+    llvm::CanonicalLoopInfo *&cur = loopMapping[mlir];
+    assert(cur == nullptr && "attempting to map a loop that is already mapped");
+    cur = llvm;
+  }
+
+  /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR
+  /// OpenMPIRBuilder CanonicalLoopInfo
+  void mapOmpLoop(Value mlir, llvm::CanonicalLoopInfo *llvm) {
+    mapOmpLoop(mlir.getDefiningOp<omp::NewCliOp>(), llvm);
+  }
+
   /// Stores the mapping between an MLIR operation with successors and a
   /// corresponding LLVM IR instruction.
   void mapBranch(Operation *mlir, llvm::Instruction *llvm) {
@@ -381,6 +418,12 @@ class ModuleTranslation {
   DenseMap<Value, llvm::Value *> valueMapping;
   DenseMap<Block *, llvm::BasicBlock *> blockMapping;
 
+  /// List of not yet consumed MLIR loop handles (represented by an omp.new_cli
+  /// operation which creates a value of type CanonicalLoopInfoType) and their
+  /// LLVM-IR representation as CanonicalLoopInfo which is managed by the
+  /// OpenMPIRBuilder.
+  DenseMap<omp::NewCliOp, llvm::CanonicalLoopInfo *> loopMapping;
+
   /// A mapping between MLIR LLVM dialect terminators and LLVM IR terminators
   /// they are converted to. This allows for connecting PHI nodes to the source
   /// values after all operations are converted.
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp 
b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index 77a2708653576..7ac9687c4eeda 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -41,6 +41,16 @@ template <typename T>
 struct OpenMPOpConversion : public ConvertOpToLLVMPattern<T> {
   using ConvertOpToLLVMPattern<T>

[llvm-branch-commits] [mlir] [MLIR][OpenMP] Add canonical loop LLVM-IR lowering (PR #147069)

2025-07-07 Thread Michael Kruse via llvm-branch-commits

https://github.com/Meinersbur ready_for_review 
https://github.com/llvm/llvm-project/pull/147069
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [LifetimeSafety] Propagate loans using dataflow analysis (PR #147295)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 created 
https://github.com/llvm/llvm-project/pull/147295

None

From 2e4261b02b6230a8c79f01a673cc3030cfff3ea7 Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena 
Date: Sun, 6 Jul 2025 19:12:55 +
Subject: [PATCH 1/6] [LifetimeSafety] Propagate loans using dataflow analysis

---
 clang/lib/Analysis/LifetimeSafety.cpp | 255 +-
 .../Sema/warn-lifetime-safety-dataflow.cpp| 186 +
 2 files changed, 440 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Analysis/LifetimeSafety.cpp 
b/clang/lib/Analysis/LifetimeSafety.cpp
index 3fe30e36ebd0f..7870352f0287a 100644
--- a/clang/lib/Analysis/LifetimeSafety.cpp
+++ b/clang/lib/Analysis/LifetimeSafety.cpp
@@ -491,7 +491,247 @@ class FactGenerator : public ConstStmtVisitor<FactGenerator> {
 };
 
 // ========================================================================= //
-//  TODO: Run dataflow analysis to propagate loans, analyse and error reporting.
+//  The Dataflow Lattice
+// ========================================================================= //
+
+// Using LLVM's immutable collections is efficient for dataflow analysis
+// as it avoids deep copies during state transitions.
+// TODO(opt): Consider using a bitset to represent the set of loans.
+using LoanSet = llvm::ImmutableSet<LoanID>;
+using OriginLoanMap = llvm::ImmutableMap<OriginID, LoanSet>;
+
+/// An object to hold the factories for immutable collections, ensuring
+/// that all created states share the same underlying memory management.
+struct LifetimeFactory {
+  OriginLoanMap::Factory OriginMapFact;
+  LoanSet::Factory LoanSetFact;
+
+  LoanSet createLoanSet(LoanID LID) {
+return LoanSetFact.add(LoanSetFact.getEmptySet(), LID);
+  }
+};
+
+/// LifetimeLattice represents the state of our analysis at a given program
+/// point. It is an immutable object, and all operations produce a new
+/// instance rather than modifying the existing one.
+struct LifetimeLattice {
+  /// The map from an origin to the set of loans it contains.
+  /// TODO(opt): To reduce the lattice size, propagate origins of declarations,
+  /// not expressions, because expressions are not visible across blocks.
+  OriginLoanMap Origins = OriginLoanMap(nullptr);
+
+  explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {}
+  LifetimeLattice() = default;
+
+  bool operator==(const LifetimeLattice &Other) const {
+return Origins == Other.Origins;
+  }
+  bool operator!=(const LifetimeLattice &Other) const {
+return !(*this == Other);
+  }
+
+  LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const {
+if (auto *Loans = Origins.lookup(OID))
+  return *Loans;
+return Factory.LoanSetFact.getEmptySet();
+  }
+
+  /// Computes the union of two lattices by performing a key-wise join of
+  /// their OriginLoanMaps.
+  // TODO(opt): This key-wise join is a performance bottleneck. A more
+  // efficient merge could be implemented using a Patricia Trie or HAMT
+  // instead of the current AVL-tree-based ImmutableMap.
+  LifetimeLattice join(const LifetimeLattice &Other,
+   LifetimeFactory &Factory) const {
+/// Merge the smaller map into the larger one ensuring we iterate over the
+/// smaller map.
+if (Origins.getHeight() < Other.Origins.getHeight())
+  return Other.join(*this, Factory);
+
+OriginLoanMap JoinedState = Origins;
+// For each origin in the other map, union its loan set with ours.
+for (const auto &Entry : Other.Origins) {
+  OriginID OID = Entry.first;
+  LoanSet OtherLoanSet = Entry.second;
+  JoinedState = Factory.OriginMapFact.add(
+  JoinedState, OID,
+  join(getLoans(OID, Factory), OtherLoanSet, Factory));
+}
+return LifetimeLattice(JoinedState);
+  }
+
+  LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const {
+/// Merge the smaller set into the larger one ensuring we iterate over the
+/// smaller set.
+if (a.getHeight() < b.getHeight())
+  std::swap(a, b);
+LoanSet Result = a;
+for (LoanID LID : b) {
+  /// TODO(opt): Profiling shows that this loop is a major performance
+  /// bottleneck. Investigate using a BitVector to represent the set of
+  /// loans for improved join performance.
+  Result = Factory.LoanSetFact.add(Result, LID);
+}
+return Result;
+  }
+
+  void dump(llvm::raw_ostream &OS) const {
+OS << "LifetimeLattice State:\n";
+if (Origins.isEmpty())
+  OS << "  \n";
+for (const auto &Entry : Origins) {
+  if (Entry.second.isEmpty())
+OS << "  Origin " << Entry.first << " contains no loans\n";
+  for (const LoanID &LID : Entry.second)
+OS << "  Origin " << Entry.first << " contains Loan " << LID << "\n";
+}
+  }
+};
+
+// = //
+//  The Transfer Function
+// = /

[llvm-branch-commits] [clang] [LifetimeSafety] Propagate loans using dataflow analysis (PR #147295)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 updated 
https://github.com/llvm/llvm-project/pull/147295

>From 2e4261b02b6230a8c79f01a673cc3030cfff3ea7 Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena 
Date: Sun, 6 Jul 2025 19:12:55 +
Subject: [PATCH 1/6] [LifetimeSafety] Propagate loans using dataflow analysis

---
 clang/lib/Analysis/LifetimeSafety.cpp | 255 +-
 .../Sema/warn-lifetime-safety-dataflow.cpp| 186 +
 2 files changed, 440 insertions(+), 1 deletion(-)
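
For orientation before the diff: the join below is a key-wise set union over
immutable collections. A minimal standalone sketch of the set-union step (not
part of the patch; it assumes only llvm::ImmutableSet from LLVM's ADT):

```cpp
#include "llvm/ADT/ImmutableSet.h"
#include <cassert>
#include <utility>

using LoanID = unsigned; // stand-in for the analysis' ID type
using LoanSet = llvm::ImmutableSet<LoanID>;

// Union two immutable sets, iterating over the smaller one so the larger
// AVL tree is reused as the starting point (same trick as in the patch).
static LoanSet joinSets(LoanSet A, LoanSet B, LoanSet::Factory &F) {
  if (A.getHeight() < B.getHeight())
    std::swap(A, B);
  LoanSet Result = A;
  for (LoanID LID : B)
    Result = F.add(Result, LID);
  return Result;
}

int main() {
  LoanSet::Factory F;
  LoanSet A = F.add(F.add(F.getEmptySet(), 1), 2);
  LoanSet B = F.add(F.getEmptySet(), 3);
  LoanSet Joined = joinSets(A, B, F);
  assert(Joined.contains(1) && Joined.contains(2) && Joined.contains(3));
  return 0;
}
```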

diff --git a/clang/lib/Analysis/LifetimeSafety.cpp 
b/clang/lib/Analysis/LifetimeSafety.cpp
index 3fe30e36ebd0f..7870352f0287a 100644
--- a/clang/lib/Analysis/LifetimeSafety.cpp
+++ b/clang/lib/Analysis/LifetimeSafety.cpp
@@ -491,7 +491,247 @@ class FactGenerator : public 
ConstStmtVisitor<FactGenerator> {
 };
 
 // = //
-//  TODO: Run dataflow analysis to propagate loans, analyse and error 
reporting.
+//  The Dataflow Lattice
+// = //
+
+// Using LLVM's immutable collections is efficient for dataflow analysis
+// as it avoids deep copies during state transitions.
+// TODO(opt): Consider using a bitset to represent the set of loans.
+using LoanSet = llvm::ImmutableSet<LoanID>;
+using OriginLoanMap = llvm::ImmutableMap<OriginID, LoanSet>;
+
+/// An object to hold the factories for immutable collections, ensuring
+/// that all created states share the same underlying memory management.
+struct LifetimeFactory {
+  OriginLoanMap::Factory OriginMapFact;
+  LoanSet::Factory LoanSetFact;
+
+  LoanSet createLoanSet(LoanID LID) {
+return LoanSetFact.add(LoanSetFact.getEmptySet(), LID);
+  }
+};
+
+/// LifetimeLattice represents the state of our analysis at a given program
+/// point. It is an immutable object, and all operations produce a new
+/// instance rather than modifying the existing one.
+struct LifetimeLattice {
+  /// The map from an origin to the set of loans it contains.
+  /// TODO(opt): To reduce the lattice size, propagate origins of declarations,
+  /// not expressions, because expressions are not visible across blocks.
+  OriginLoanMap Origins = OriginLoanMap(nullptr);
+
+  explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {}
+  LifetimeLattice() = default;
+
+  bool operator==(const LifetimeLattice &Other) const {
+return Origins == Other.Origins;
+  }
+  bool operator!=(const LifetimeLattice &Other) const {
+return !(*this == Other);
+  }
+
+  LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const {
+if (auto *Loans = Origins.lookup(OID))
+  return *Loans;
+return Factory.LoanSetFact.getEmptySet();
+  }
+
+  /// Computes the union of two lattices by performing a key-wise join of
+  /// their OriginLoanMaps.
+  // TODO(opt): This key-wise join is a performance bottleneck. A more
+  // efficient merge could be implemented using a Patricia Trie or HAMT
+  // instead of the current AVL-tree-based ImmutableMap.
+  LifetimeLattice join(const LifetimeLattice &Other,
+   LifetimeFactory &Factory) const {
+// Merge the smaller map into the larger one, ensuring we iterate over the
+// smaller map.
+if (Origins.getHeight() < Other.Origins.getHeight())
+  return Other.join(*this, Factory);
+
+OriginLoanMap JoinedState = Origins;
+// For each origin in the other map, union its loan set with ours.
+for (const auto &Entry : Other.Origins) {
+  OriginID OID = Entry.first;
+  LoanSet OtherLoanSet = Entry.second;
+  JoinedState = Factory.OriginMapFact.add(
+  JoinedState, OID,
+  join(getLoans(OID, Factory), OtherLoanSet, Factory));
+}
+return LifetimeLattice(JoinedState);
+  }
+
+  LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const {
+// Merge the smaller set into the larger one, ensuring we iterate over the
+// smaller set.
+if (a.getHeight() < b.getHeight())
+  std::swap(a, b);
+LoanSet Result = a;
+for (LoanID LID : b) {
+  // TODO(opt): Profiling shows that this loop is a major performance
+  // bottleneck. Investigate using a BitVector to represent the set of
+  // loans for improved join performance.
+  Result = Factory.LoanSetFact.add(Result, LID);
+}
+return Result;
+  }
+
+  void dump(llvm::raw_ostream &OS) const {
+OS << "LifetimeLattice State:\n";
+if (Origins.isEmpty())
+  OS << "  <empty>\n";
+for (const auto &Entry : Origins) {
+  if (Entry.second.isEmpty())
+OS << "  Origin " << Entry.first << " contains no loans\n";
+  for (const LoanID &LID : Entry.second)
+OS << "  Origin " << Entry.first << " contains Loan " << LID << "\n";
+}
+  }
+};
+
+// = //
+//  The Transfer Function
+// = //
+cla

[llvm-branch-commits] [llvm] AtomicExpand: Stop using report_fatal_error (PR #147300)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/147300

Emit a context error and delete the instruction. This
allows removing the AMDGPU hack where some atomic libcalls
are falsely added. NVPTX also later copied the same hack,
so remove it there too.

For now just emit the generic error, which is not good. It's
missing any useful context information (despite taking the instruction).
It's also confusing in the failed atomicrmw case, since it's reporting
failure at the intermediate failed cmpxchg instead of the original
atomicrmw.
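
The failure path reduces to a small helper (a sketch mirroring the
handleFailure added in the patch below; not a complete pass):

```cpp
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Report the unsupported atomic at its source instruction, then delete it,
// replacing any uses with poison so the surrounding IR stays well-formed.
static void handleFailure(Instruction &FailedInst, const Twine &Msg) {
  LLVMContext &Ctx = FailedInst.getContext();
  Ctx.emitError(&FailedInst, Msg); // still the generic error kind for now
  if (!FailedInst.getType()->isVoidTy())
    FailedInst.replaceAllUsesWith(PoisonValue::get(FailedInst.getType()));
  FailedInst.eraseFromParent();
}
```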

>From 4d46f60b03774704354e98ccea89d4c622c7d300 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 7 Jul 2025 21:25:22 +0900
Subject: [PATCH] AtomicExpand: Stop using report_fatal_error

Emit a context error and delete the instruction. This
allows removing the AMDGPU hack where some atomic libcalls
are falsely added. NVPTX also later copied the same hack,
so remove it there too.

For now just emit the generic error, which is not good. It's
missing any useful context information (despite taking the instruction).
It's also confusing in the failed atomicrmw case, since it's reporting
failure at the intermediate failed cmpxchg instead of the original
atomicrmw.
---
 llvm/lib/CodeGen/AtomicExpandPass.cpp |  17 +-
 llvm/lib/IR/RuntimeLibcalls.cpp   |   6 +-
 llvm/test/CodeGen/AMDGPU/atomic-oversize.ll   |  10 -
 .../CodeGen/AMDGPU/unsupported-atomics.ll |  55 ++
 .../CodeGen/NVPTX/atomicrmw-expand.err.ll |  27 +
 llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll   |  28 -
 .../AMDGPU/expand-atomic-fp128.ll | 122 ---
 .../AtomicExpand/AMDGPU/expand-atomic-i128.ll | 201 -
 .../AtomicExpand/AMDGPU/expand-atomic-mmra.ll |  25 -
 .../AMDGPU/expand-atomicrmw-fp-vector.ll  | 752 +-
 .../AtomicExpand/AMDGPU/unaligned-atomic.ll   |  22 +-
 .../TableGen/Basic/RuntimeLibcallsEmitter.cpp |  19 +-
 12 files changed, 106 insertions(+), 1178 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/atomic-oversize.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll
 delete mode 100644 
llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-fp128.ll
 delete mode 100644 
llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i128.ll

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp 
b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 044f0732779f3..44295b44482e7 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,6 +65,17 @@ class AtomicExpandImpl {
   const DataLayout *DL = nullptr;
 
 private:
+  void handleFailure(Instruction &FailedInst, const Twine &Msg) const {
+LLVMContext &Ctx = FailedInst.getContext();
+
+// TODO: Do not use generic error type
+Ctx.emitError(&FailedInst, Msg);
+
+if (!FailedInst.getType()->isVoidTy())
+  FailedInst.replaceAllUsesWith(PoisonValue::get(FailedInst.getType()));
+FailedInst.eraseFromParent();
+  }
+
   bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
   IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
   LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
@@ -1744,7 +1755,7 @@ void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst 
*I) {
   I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr,
   I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   if (!expanded)
-report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Load");
+handleFailure(*I, "unsupported atomic load");
 }
 
 void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) {
@@ -1757,7 +1768,7 @@ void 
AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) {
   I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(),
   nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   if (!expanded)
-report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Store");
+handleFailure(*I, "unsupported atomic store");
 }
 
 void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
@@ -1772,7 +1783,7 @@ void 
AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
   I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
   Libcalls);
   if (!expanded)
-report_fatal_error("expandAtomicOpToLibcall shouldn't fail for CAS");
+handleFailure(*I, "unsupported cmpxchg");
 }
 
 static ArrayRef GetRMWLibcall(AtomicRMWInst::BinOp Op) {
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 712f1a48d0b7b..b21504037be8f 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -455,10 +455,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
 
   // Disable most libcalls on AMDGPU and NVPTX.
   if (TT.isAMDGPU() || TT.isNVPTX()) {
-for (RTLIB::Libcall LC : RTLIB::libcalls()) {
-  if (!isAtomicLibCall(LC))
-   

[llvm-branch-commits] [llvm] AtomicExpand: Stop using report_fatal_error (PR #147300)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes

Emit a context error and delete the instruction. This
allows removing the AMDGPU hack where some atomic libcalls
are falsely added. NVPTX also later copied the same hack,
so remove it there too.

For now just emit the generic error, which is not good. It's
missing any useful context information (despite taking the instruction).
It's also confusing in the failed atomicrmw case, since it's reporting
failure at the intermediate failed cmpxchg instead of the original
atomicrmw.

---

Patch is 92.08 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/147300.diff


12 Files Affected:

- (modified) llvm/lib/CodeGen/AtomicExpandPass.cpp (+14-3) 
- (modified) llvm/lib/IR/RuntimeLibcalls.cpp (+2-4) 
- (removed) llvm/test/CodeGen/AMDGPU/atomic-oversize.ll (-10) 
- (added) llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll (+55) 
- (added) llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll (+27) 
- (modified) llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll (-28) 
- (removed) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-fp128.ll 
(-122) 
- (removed) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i128.ll 
(-201) 
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll 
(-25) 
- (modified) 
llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll (+4-748) 
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll 
(+3-19) 
- (modified) llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp (+1-18) 


```diff
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp 
b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 044f0732779f3..44295b44482e7 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,6 +65,17 @@ class AtomicExpandImpl {
   const DataLayout *DL = nullptr;
 
 private:
+  void handleFailure(Instruction &FailedInst, const Twine &Msg) const {
+LLVMContext &Ctx = FailedInst.getContext();
+
+// TODO: Do not use generic error type
+Ctx.emitError(&FailedInst, Msg);
+
+if (!FailedInst.getType()->isVoidTy())
+  FailedInst.replaceAllUsesWith(PoisonValue::get(FailedInst.getType()));
+FailedInst.eraseFromParent();
+  }
+
   bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
   IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
   LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
@@ -1744,7 +1755,7 @@ void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst 
*I) {
   I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr,
   I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   if (!expanded)
-report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Load");
+handleFailure(*I, "unsupported atomic load");
 }
 
 void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) {
@@ -1757,7 +1768,7 @@ void 
AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) {
   I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(),
   nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   if (!expanded)
-report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Store");
+handleFailure(*I, "unsupported atomic store");
 }
 
 void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
@@ -1772,7 +1783,7 @@ void 
AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
   I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
   Libcalls);
   if (!expanded)
-report_fatal_error("expandAtomicOpToLibcall shouldn't fail for CAS");
+handleFailure(*I, "unsupported cmpxchg");
 }
 
 static ArrayRef GetRMWLibcall(AtomicRMWInst::BinOp Op) {
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 712f1a48d0b7b..b21504037be8f 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -455,10 +455,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
 
   // Disable most libcalls on AMDGPU and NVPTX.
   if (TT.isAMDGPU() || TT.isNVPTX()) {
-for (RTLIB::Libcall LC : RTLIB::libcalls()) {
-  if (!isAtomicLibCall(LC))
-setLibcallImpl(LC, RTLIB::Unsupported);
-}
+for (RTLIB::Libcall LC : RTLIB::libcalls())
+  setLibcallImpl(LC, RTLIB::Unsupported);
   }
 
   if (TT.isOSMSVCRT()) {
diff --git a/llvm/test/CodeGen/AMDGPU/atomic-oversize.ll 
b/llvm/test/CodeGen/AMDGPU/atomic-oversize.ll
deleted file mode 100644
index f62a93f523365..0
--- a/llvm/test/CodeGen/AMDGPU/atomic-oversize.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s 
| FileCheck %s
-
-define void @test(ptr %a) nounwind {
-; CHECK-LABEL: test:
-; CHECK: __atomic_load_16
-; CHECK: __atomic_store_16
-  %1 = load atomic i128, ptr %a seq_cst, align 16
-  store atomic i128 %1, ptr %a seq_cst, align 

[llvm-branch-commits] [llvm] AtomicExpand: Stop using report_fatal_error (PR #147300)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite. Learn more: https://graphite.dev/docs/merge-pull-requests

* **#147300** 👈 (view in Graphite:
  https://app.graphite.dev/github/pr/llvm/llvm-project/147300)
* **#147299** (https://app.graphite.dev/github/pr/llvm/llvm-project/147299)
* `main`

This stack of pull requests is managed by Graphite (https://graphite.dev).
Learn more about stacking: https://stacking.dev/


https://github.com/llvm/llvm-project/pull/147300
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LV] Bundle partial reductions inside VPExpressionRecipe (PR #147302)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Sam Tebbs (SamTebbs33)


Changes

This PR bundles partial reductions inside the VPExpressionRecipe class.

Depends on https://github.com/llvm/llvm-project/pull/147255 .
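
For context, the TTI hook reworked in this patch maps a cast opcode directly
to an extend kind; a usage sketch (assumes this patch is applied):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// ZExt -> PR_ZeroExtend, SExt -> PR_SignExtend, anything else -> PR_None.
static TargetTransformInfo::PartialReductionExtendKind
classifyExtend(Instruction::CastOps Opc) {
  return TargetTransformInfo::getPartialReductionExtendKind(Opc);
}
```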

---

Patch is 202.63 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/147302.diff


16 Files Affected:

- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+2) 
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+15-4) 
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+1-1) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+6-2) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+23) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+7-4) 
- (modified) 
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll (+55-35) 
- (modified) 
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
 (+2-2) 
- (modified) 
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll 
(+98-98) 
- (modified) 
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll 
(+42-42) 
- (modified) 
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll 
(+299-279) 
- (modified) 
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll 
(+14-22) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll 
(+12-12) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll 
(+11-20) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll 
(+4-7) 
- (modified) 
llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll (+26-26) 


```diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 3cc0ea01953c3..338599a9bb5aa 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -223,6 +223,8 @@ class TargetTransformInfo {
   /// Get the kind of extension that an instruction represents.
   LLVM_ABI static PartialReductionExtendKind
   getPartialReductionExtendKind(Instruction *I);
+  LLVM_ABI static PartialReductionExtendKind
+  getPartialReductionExtendKind(Instruction::CastOps CastOpc);
 
   /// Construct a TTI object using a type implementing the \c Concept
   /// API below.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp 
b/llvm/lib/Analysis/TargetTransformInfo.cpp
index ba0d070bffe6d..5e9733a264e22 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1001,13 +1001,24 @@ InstructionCost TargetTransformInfo::getShuffleCost(
 
 TargetTransformInfo::PartialReductionExtendKind
 TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
-  if (isa<SExtInst>(I))
-return PR_SignExtend;
-  if (isa<ZExtInst>(I))
-return PR_ZeroExtend;
+  if (auto *Cast = dyn_cast<CastInst>(I))
+return getPartialReductionExtendKind(Cast->getOpcode());
   return PR_None;
 }
 
+TargetTransformInfo::PartialReductionExtendKind
+TargetTransformInfo::getPartialReductionExtendKind(
+Instruction::CastOps CastOpc) {
+  switch (CastOpc) {
+  case Instruction::CastOps::ZExt:
+return PR_ZeroExtend;
+  case Instruction::CastOps::SExt:
+return PR_SignExtend;
+  default:
+return PR_None;
+  }
+}
+
 TTI::CastContextHint
 TargetTransformInfo::getCastContextHint(const Instruction *I) {
   if (!I)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp 
b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d9a367535baf4..5021a490839b2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5294,7 +5294,7 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost(
   EVT ResVT = TLI->getValueType(DL, ResTy);
 
   if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
-  VecVT.getSizeInBits() >= 64) {
+  VecVT.isFixedLengthVector() && VecVT.getSizeInBits() >= 64) {
 std::pair LT = getTypeLegalizationCost(VecTy);
 
 // The legal cases are:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h 
b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1bc926db301d8..30f3566332d79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2470,7 +2470,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
 
   static inline bool classof(const VPRecipeBase *R) {
 return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
-   R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
+   R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
+   R->getVPDefID() == VPRecipeBase::VPPartialReductionSC;
   }
 
   static inline bool classof(const VPUser *U) {
@@ -2532,7 +2533,10 @@ class VPPartialReductionRecipe : public 
VPReductionRecipe {
 Opcode(Opcode), VFScaleFactor(ScaleFactor) {
 [

[llvm-branch-commits] [llvm] AtomicExpand: Stop using report_fatal_error (PR #147300)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/147300
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] MSP430: Move libcall CC setting to RuntimeLibcallsInfo (PR #146081)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/146081

>From ca7e199c05935ba53568fe96520acce04b5727c1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 23 Jun 2025 16:35:14 +0900
Subject: [PATCH] MSP430: Move libcall CC setting to RuntimeLibcallsInfo

As a temporary step, configure the calling convention here. This
can't be moved into tablegen until RuntimeLibcallsInfo is split
into a separate lowering component.
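
With the move, the convention is visible from RuntimeLibcallsInfo alone,
without MSP430ISelLowering having run. A query sketch (assumes this patch
and the earlier getLibcallImplCallingConv change in this stack):

```cpp
#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/TargetParser/Triple.h"

using namespace llvm;

// Expected to yield CallingConv::MSP430_BUILTIN after this change.
static CallingConv::ID mspabiMpyllCC() {
  RTLIB::RuntimeLibcallsInfo Info(Triple("msp430-none-elf"));
  return Info.getLibcallImplCallingConv(RTLIB::__mspabi_mpyll);
}
```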
---
 llvm/lib/IR/RuntimeLibcalls.cpp   | 5 +
 llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 2 --
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 1376ffc7c7293..8c3257147213d 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -364,6 +364,11 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
 RTLIB::HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES,
 RTLIB::__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes);
   }
+
+  if (TT.getArch() == Triple::ArchType::msp430) {
+setLibcallImplCallingConv(RTLIB::__mspabi_mpyll,
+  CallingConv::MSP430_BUILTIN);
+  }
 }
 
 bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) {
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp 
b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index 20d1781946f0f..d23504c203dd3 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -208,8 +208,6 @@ MSP430TargetLowering::MSP430TargetLowering(const 
TargetMachine &TM,
 for (const auto &LC : LibraryCalls) {
   setLibcallImpl(LC.Op, LC.Impl);
 }
-setLibcallImplCallingConv(RTLIB::__mspabi_mpyll,
-  CallingConv::MSP430_BUILTIN);
   }
 
   setMinFunctionAlignment(Align(2));

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] Lanai: Use TableGen to set libcall calling conventions (PR #146080)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/146080

>From a492a7e695c2f543b6caba066f5d8beb4272b8cf Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 23 Jun 2025 16:17:26 +0900
Subject: [PATCH] Lanai: Use TableGen to set libcall calling conventions

---
 llvm/include/llvm/IR/RuntimeLibcalls.td | 12 
 llvm/lib/Target/Lanai/LanaiISelLowering.cpp |  4 
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td 
b/llvm/include/llvm/IR/RuntimeLibcalls.td
index f8667269ec8cb..c15ffa0653335 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -1389,6 +1389,18 @@ def __hexagon_fast2_sqrtdf2 : 
RuntimeLibcallImpl;
 def __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
 : RuntimeLibcallImpl;
 
+//===--===//
+// Lanai Runtime Libcalls
+//===--===//
+
+def isLanai : RuntimeLibcallPredicate<"TT.getArch() == Triple::lanai">;
+
+// Use fast calling convention for library functions.
+def LanaiSystemLibrary
+: SystemRuntimeLibrary {
+  let DefaultLibcallCallingConv = FASTCC;
+}
+
 
//===--===//
 // Mips16 Runtime Libcalls
 
//===--===//
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp 
b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index 6fb73c5d18966..d23c5f43ad4ff 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -150,10 +150,6 @@ LanaiTargetLowering::LanaiTargetLowering(const 
TargetMachine &TM,
   // statements. Re-evaluate this on new benchmarks.
   setMinimumJumpTableEntries(100);
 
-  // Use fast calling convention for library functions.
-  for (RTLIB::LibcallImpl LC : RTLIB::libcall_impls())
-setLibcallImplCallingConv(LC, CallingConv::Fast);
-
   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] ARM: Start moving runtime libcalls into tablegen (PR #146084)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/146084

>From 8eb2e09e5f533bbf706445437d7cf5590d775fab Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 23 Jun 2025 23:23:01 +0900
Subject: [PATCH] ARM: Start moving runtime libcalls into tablegen

We still need to manually set the calling conventions of
some libcalls until the lowering is separated out.
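
The default calling convention encoded in the .td below mirrors what the C++
previously computed; roughly (a sketch of the same per-triple choice):

```cpp
#include "llvm/IR/CallingConv.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"

using namespace llvm;

// Non-Darwin ARM defaults to AAPCS (the VFP variant under hard-float);
// Darwin-family triples keep the C calling convention.
static CallingConv::ID armDefaultLibcallCC(const Triple &TT,
                                           FloatABI::ABIType ABI) {
  if (TT.isOSDarwin() || TT.isiOS() || TT.isWatchOS() || TT.isDriverKit())
    return CallingConv::C;
  return ABI == FloatABI::Hard ? CallingConv::ARM_AAPCS_VFP
                               : CallingConv::ARM_AAPCS;
}
```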
---
 llvm/include/llvm/IR/RuntimeLibcalls.h|  2 +-
 llvm/include/llvm/IR/RuntimeLibcalls.td   | 48 
 llvm/lib/IR/RuntimeLibcalls.cpp   | 73 +--
 .../RuntimeLibcallEmitter-calling-conv.td |  2 +-
 llvm/test/TableGen/RuntimeLibcallEmitter.td   |  2 +-
 .../TableGen/Basic/RuntimeLibcallsEmitter.cpp |  2 +-
 6 files changed, 53 insertions(+), 76 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h 
b/llvm/include/llvm/IR/RuntimeLibcalls.h
index ac83df3a4189e..4ea5ff9c9ade8 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -170,7 +170,7 @@ struct RuntimeLibcallsInfo {
   void initDefaultLibCallImpls();
 
   /// Generated by tablegen.
-  void setTargetRuntimeLibcallSets(const Triple &TT);
+  void setTargetRuntimeLibcallSets(const Triple &TT, FloatABI::ABIType 
FloatABI);
 
   /// Set default libcall names. If a target wants to opt-out of a libcall it
   /// should be placed here.
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td 
b/llvm/include/llvm/IR/RuntimeLibcalls.td
index c15ffa0653335..0237c8b41ae8c 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -17,6 +17,7 @@ class DuplicateLibcallImplWithPrefix
 
 /// Libcall Predicates
 def isOSDarwin : RuntimeLibcallPredicate<"TT.isOSDarwin()">;
+def isOSWindows : RuntimeLibcallPredicate<"TT.isOSWindows()">;
 
 def darwinHasSinCosStret : RuntimeLibcallPredicate<"darwinHasSinCosStret(TT)">;
 def darwinHasExp10 : RuntimeLibcallPredicate<"darwinHasExp10(TT)">;
@@ -1272,6 +1273,7 @@ def __aeabi_memclr4 : RuntimeLibcallImpl;
 def __aeabi_memclr8 : RuntimeLibcallImpl;
 
 // isTargetWindows()
+defset list WindowsFPIntCastLibcalls = {
 def __stoi64 : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
 def __dtoi64 : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
 def __stou64 : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
@@ -1280,6 +1282,7 @@ def __i64tos : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_V
 def __i64tod : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
 def __u64tos : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
 def __u64tod : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
+}
 
 def __rt_sdiv : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS
 def __rt_sdiv64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS
@@ -1306,6 +1309,51 @@ def __aeabi_h2f : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS
 def __gnu_f2h_ieee : RuntimeLibcallImpl;
 def __gnu_h2f_ieee : RuntimeLibcallImpl;
 
+
+def WindowARMDivRemCalls : LibcallImpls<
+  (add __rt_sdiv, __rt_sdiv64, __rt_udiv, __rt_udiv64),
+  isOSWindows> {
+  let CallingConv = ARM_AAPCS;
+}
+
+def WindowARMFPIntCasts : LibcallImpls<
+  (add WindowsFPIntCastLibcalls),
+  isOSWindows> {
+  let CallingConv = ARM_AAPCS_VFP;
+}
+
+
+// Register based DivRem for AEABI (RTABI 4.2)
+def AEABIDivRemCalls : LibcallImpls<
+  (add __aeabi_idivmod, __aeabi_ldivmod,
+   __aeabi_uidivmod, __aeabi_uldivmod),
+  RuntimeLibcallPredicate<[{TT.isTargetAEABI() || TT.isAndroid() || 
TT.isTargetGNUAEABI() ||
+TT.isTargetMuslAEABI()}]>> {
+  let CallingConv = ARM_AAPCS;
+}
+
+def isARMOrThumb : RuntimeLibcallPredicate<"TT.isARM() || TT.isThumb()">;
+
+def ARMSystemLibrary
+: SystemRuntimeLibrary>)> {
+  let DefaultLibcallCallingConv = LibcallCallingConv<[{
+ (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit()) 
?
+(FloatABI == FloatABI::Hard ? CallingConv::ARM_AAPCS_VFP
+: CallingConv::ARM_AAPCS) :
+  CallingConv::C
+  }]>;
+}
+
 
//===--===//
 // AVR Runtime Libcalls
 
//===--===//
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index c2d0b0684ec39..2d168befd145c 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -25,77 +25,6 @@ static cl::opt
 static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT,
FloatABI::ABIType FloatABIType,
EABI EABIVersion) {
-  if (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit()) 
{
-CallingConv::ID DefaultCC = FloatABIType == FloatABI::Hard
-? CallingConv::ARM_AAPCS_VFP
-: CallingConv::ARM_AAPCS;
-for (RTLIB::LibcallImpl LC : RTLIB::libcall_i

[llvm-branch-commits] [llvm] TableGen: Handle setting runtime libcall calling conventions (PR #144980)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/144980

>From 414f451ba5714d7aef14c2fdd7f95fc7f7d8be19 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 13 Jun 2025 15:54:41 +0900
Subject: [PATCH] TableGen: Handle setting runtime libcall calling conventions

Allow associating a non-default CallingConv with a set of library
functions, and applying a default for a SystemLibrary.

I also wanted to be able to apply a default calling conv
to a RuntimeLibcallImpl, but that turned out to be annoying
so leave it for later.
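
Functionally, a LibcallsWithCC group asks the emitter to attach one calling
convention to each listed implementation; in C++ terms this is roughly the
following (the exact shape of the generated code is an assumption):

```cpp
#include "llvm/IR/RuntimeLibcalls.h"

using namespace llvm;

// What LibcallsWithCC<(add __divmodqi4, __divmodhi4, __udivmodqi4,
// __udivmodhi4), AVR_BUILTIN> boils down to at runtime.
static void applyAVRBuiltinCC(RTLIB::RuntimeLibcallsInfo &Info) {
  for (RTLIB::LibcallImpl Impl :
       {RTLIB::__divmodqi4, RTLIB::__divmodhi4, RTLIB::__udivmodqi4,
        RTLIB::__udivmodhi4})
    Info.setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
}
```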
---
 llvm/include/llvm/IR/RuntimeLibcalls.td   | 140 +--
 llvm/include/llvm/IR/RuntimeLibcallsImpl.td   |  27 ++-
 llvm/lib/IR/RuntimeLibcalls.cpp   | 141 ---
 .../RuntimeLibcallEmitter-calling-conv.td | 128 ++
 llvm/test/TableGen/RuntimeLibcallEmitter.td   |  19 +-
 .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 163 ++
 6 files changed, 429 insertions(+), 189 deletions(-)
 create mode 100644 llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td 
b/llvm/include/llvm/IR/RuntimeLibcalls.td
index e6fffa2c7f933..f8667269ec8cb 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -1311,11 +1311,12 @@ def __gnu_h2f_ieee : RuntimeLibcallImpl;
 
//===--===//
 
 // Several of the runtime library functions use a special calling conv
-def __divmodqi4 : RuntimeLibcallImpl; // CallingConv::AVR_BUILTIN
-def __divmodhi4 : RuntimeLibcallImpl; // CallingConv::AVR_BUILTIN
+def __divmodqi4 : RuntimeLibcallImpl;
+def __divmodhi4 : RuntimeLibcallImpl;
+def __udivmodqi4 : RuntimeLibcallImpl;
+def __udivmodhi4 : RuntimeLibcallImpl;
+
 //def __divmodsi4 : RuntimeLibcallImpl;
-def __udivmodqi4 : RuntimeLibcallImpl; // CallingConv::AVR_BUILTIN
-def __udivmodhi4 : RuntimeLibcallImpl; // CallingConv::AVR_BUILTIN
 //def __udivmodsi4 : RuntimeLibcallImpl;
 
 // Standard sinf/cosf name replaced with "sin" and "cos". Define a
@@ -1341,9 +1342,12 @@ def AVRSystemLibrary
// Standard f64 names are replaced
sin, cos, sinf, cosf),
 
-  __divmodqi4, __divmodhi4, __divmodsi4, __udivmodqi4, 
__udivmodhi4,
-  __udivmodsi4,
-
+  // Several of the runtime library functions use a special calling
+  // conv
+  LibcallsWithCC<(add __divmodqi4, __divmodhi4, __udivmodqi4,
+ __udivmodhi4),
+ AVR_BUILTIN>,
+  __divmodsi4, __udivmodsi4,
   // Trigonometric rtlib functions
   avr_sin, avr_cos)>;
 
@@ -1566,6 +1570,117 @@ def __mspabi_mpyll : RuntimeLibcallImpl;
 
 // setLibcallCallingConv(MUL_I64, CallingConv::MSP430_BUILTIN);
 
+def isMSP430 : RuntimeLibcallPredicate<"TT.getArch() == Triple::msp430">;
+
+defvar MSP430DefaultOptOut = [
+  __addsf3, __divsf3, __extendsfdf2, __truncdfsf2, __fixsfsi,
+  __fixsfdi, __fixunssfsi, __mulsf3, __eqsf2, __gesf2, __gtsf2,
+  __divhi3, __divsi3, __ashlsi3, __floatsidf, __floatsisf,
+  __ashrsi3, __modhi3, __udivsi3, __fixdfsi, __fixunssfdi,
+  __udivhi3, __umodsi3, __nesf2, __lesf2, __floatundisf,
+  __fixdfdi, __fixunsdfsi, __modsi3, __floatunsisf,
+  __fixunsdfdi, __ltsf2, __floatdisf, __floatdidf,
+  __lshrsi3, __subsf3, __umodhi3, __floatunsidf,
+  __floatundidf
+];
+
+// EABI Libcalls - EABI Section 6.2
+def MSP430SystemLibrary
+: SystemRuntimeLibrary,
+  __mspabi_cmpf__oeq,
+  __mspabi_cmpf__une,
+  __mspabi_cmpf__oge,
+  __mspabi_cmpf__olt,
+  __mspabi_cmpf__ole,
+  __mspabi_cmpf__ogt,
+
+  // Floating point arithmetic - EABI Table 8
+  LibcallsWithCC<(add __mspabi_addd,
+  __mspabi_subd,
+  __mspabi_mpyd,
+  __mspabi_divd), MSP430_BUILTIN>,
+
+  __mspabi_addf,
+  __mspabi_subf,
+  __mspabi_mpyf,
+  __mspabi_divf,
+
+  // The following are NOT implemented in libgcc
+  // __mspabi_negd,
+  // __mspabi_negf,
+
+  // Universal Integer Operations - EABI Table 9
+  __mspabi_divi,
+  __mspabi_divli,
+  LibcallsWithCC<(add __mspabi_divlli), MSP430_BUILTIN>,
+  __mspabi_divu,
+  __mspabi_divul,
+  LibcallsWithCC<(add __mspabi_divull), MSP430_BUILTIN>,
+  __mspabi_remi,
+  __mspabi_remli,
+  LibcallsWithCC<(add __mspabi_remlli), MSP430_BUILTIN>,
+  __mspabi_remu,
+  __mspabi_remul,
+  LibcallsWithCC<(add __mspabi_remull), MSP430_BUILTIN>,
+
+  // Bitwise Operations - EABI Table 10
+  // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc
+  __mspabi_srll,
+  __mspabi_sral,
+  __mspabi_slll
+  // __mspabi_[srlll/srall/s/rlli/rlll] are NOT implemented in libgcc
+  )
+>;
+
 
//===--===//
 // NVPTX Runtime Libcalls
 
//===---

[llvm-branch-commits] [llvm] RuntimeLibcalls: Remove table of soft float compare cond codes (PR #146082)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/146082

>From effe1ad6d053a4dffccc3d68574868565ce94397 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 23 Jun 2025 19:10:30 +0900
Subject: [PATCH] RuntimeLibcalls: Remove table of soft float compare cond
 codes

Previously we had a table with an entry for every Libcall giving
the comparison to use against an integer 0 if it was a soft
float compare function. This was only relevant to a handful of
opcodes, so it was wasteful. Now that we can distinguish the
abstract libcall for the compare with the concrete implementation,
we can just directly hardcode the comparison against the libcall
impl without this configuration system.
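
The replacement is a plain function of the concrete implementation. An
illustrative sketch (only the two __aeabi_dcmpeq cases reflect real mappings
from the ARM patch in this stack; the default arm is a placeholder):

```cpp
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/RuntimeLibcalls.h"

using namespace llvm;

// Each soft-float compare impl fixes how its integer result is tested
// against zero, so no per-target configuration table is needed.
static ISD::CondCode softFloatCmpPredicate(RTLIB::LibcallImpl Impl) {
  switch (Impl) {
  case RTLIB::__aeabi_dcmpeq__ne:
    return ISD::SETNE; // OEQ: __aeabi_dcmpeq(x, y) != 0 means "equal"
  case RTLIB::__aeabi_dcmpeq__eq:
    return ISD::SETEQ; // UNE: __aeabi_dcmpeq(x, y) == 0 means "unequal"
  default:
    return ISD::SETNE; // placeholder; see the patch for the full mapping
  }
}
```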
---
 .../include/llvm/CodeGen/RuntimeLibcallUtil.h |   3 -
 llvm/include/llvm/CodeGen/TargetLowering.h|  17 +-
 llvm/include/llvm/IR/RuntimeLibcalls.h|  32 +---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  16 +-
 llvm/lib/CodeGen/TargetLoweringBase.cpp   | 107 +++
 llvm/lib/IR/RuntimeLibcalls.cpp   |  31 ---
 llvm/lib/Target/ARM/ARMISelLowering.cpp   | 176 +-
 7 files changed, 182 insertions(+), 200 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h 
b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index 7481ed5b80b3f..09a8151e9ec9c 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -105,9 +105,6 @@ LLVM_ABI Libcall 
getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
 /// UNKNOW_LIBCALL if there is none.
 LLVM_ABI Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
 
-/// Initialize the default condition code on the libcalls.
-LLVM_ABI void initCmpLibcallCCs(ISD::CondCode *CmpLibcallCCs);
-
 } // namespace RTLIB
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h 
b/llvm/include/llvm/CodeGen/TargetLowering.h
index fee94cc167363..fa46d296bf533 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3571,19 +3571,10 @@ class LLVM_ABI TargetLoweringBase {
 
   const char *getMemcpyName() const { return Libcalls.getMemcpyName(); }
 
-  /// Override the default CondCode to be used to test the result of the
-  /// comparison libcall against zero.
-  /// FIXME: This should be removed
-  void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) {
-Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred);
-  }
-
-  /// Get the CondCode that's to be used to test the result of the comparison
-  /// libcall against zero.
-  CmpInst::Predicate
-  getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const {
-return Libcalls.getSoftFloatCmpLibcallPredicate(Call);
-  }
+  /// Get the comparison predicate that's to be used to test the result of the
+  /// comparison libcall against zero. This should only be used with
+  /// floating-point compare libcalls.
+  ISD::CondCode getSoftFloatCmpLibcallPredicate(RTLIB::LibcallImpl Call) const;
 
   /// Set the CallingConv that should be used for the specified libcall.
   void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h 
b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 66d11c4cbabb7..ac83df3a4189e 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -59,7 +59,6 @@ struct RuntimeLibcallsInfo {
   ExceptionHandling ExceptionModel = ExceptionHandling::None,
   FloatABI::ABIType FloatABI = FloatABI::Default,
   EABI EABIVersion = EABI::Default, StringRef ABIName = "") {
-initSoftFloatCmpLibcallPredicates();
 initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName);
   }
 
@@ -106,22 +105,6 @@ struct RuntimeLibcallsInfo {
 return ArrayRef(LibcallImpls).drop_front();
   }
 
-  /// Get the comparison predicate that's to be used to test the result of the
-  /// comparison libcall against zero. This should only be used with
-  /// floating-point compare libcalls.
-  // FIXME: This should be a function of RTLIB::LibcallImpl
-  CmpInst::Predicate
-  getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const {
-return SoftFloatCompareLibcallPredicates[Call];
-  }
-
-  // FIXME: This should be removed. This should be private constant.
-  // FIXME: This should be a function of RTLIB::LibcallImpl
-  void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call,
-   CmpInst::Predicate Pred) {
-SoftFloatCompareLibcallPredicates[Call] = Pred;
-  }
-
   /// Return a function name compatible with RTLIB::MEMCPY, or nullptr if fully
   /// unsupported.
   const char *getMemcpyName() const {
@@ -132,6 +115,11 @@ struct RuntimeLibcallsInfo {
 return getLibcallName(RTLIB::MEMMOVE);
   }
 
+  /// Return the libcall provided by \p Impl
+  static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) {
+return ImplToLibcall[Impl];
+  }
+
 priva

[llvm-branch-commits] [llvm] RuntimeLibcalls: Associate calling convention with libcall impls (PR #144979)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/144979

>From 5a9b9d9720f01a7575f33feb73042eafcfa3f82c Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 17 Jun 2025 16:25:50 +0900
Subject: [PATCH] RuntimeLibcalls: Associate calling convention with libcall
 impls

Instead of associating the libcall with the RTLIB::Libcall, put it
into a table indexed by the RTLIB::LibcallImpl. The LibcallImpls
should contain all ABI details for a particular implementation, not
the abstract Libcall. In the future the wrappers in terms of the
RTLIB::Libcall should be removed.
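
Distilled, the reindexing looks like this (a sketch; member names match the
diff below):

```cpp
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/RuntimeLibcalls.h"

using namespace llvm;

struct CallingConvTables {
  // Which implementation currently backs each abstract libcall.
  RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {};
  // The ABI detail now lives with the concrete implementation.
  CallingConv::ID LibcallImplCallingConvs[RTLIB::NumLibcallImpls] = {};

  // The legacy per-Libcall query becomes a two-step lookup through the
  // currently selected implementation.
  CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
    return LibcallImplCallingConvs[LibcallImpls[Call]];
  }
};
```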
---
 llvm/include/llvm/CodeGen/TargetLowering.h| 16 -
 llvm/include/llvm/IR/RuntimeLibcalls.h| 32 ++---
 llvm/lib/IR/RuntimeLibcalls.cpp   | 70 +++
 llvm/lib/Target/ARM/ARMISelLowering.cpp   | 18 ++---
 llvm/lib/Target/Lanai/LanaiISelLowering.cpp   |  4 +-
 llvm/lib/Target/MSP430/MSP430ISelLowering.cpp |  3 +-
 6 files changed, 92 insertions(+), 51 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h 
b/llvm/include/llvm/CodeGen/TargetLowering.h
index 420f1d5fb20ca..fee94cc167363 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3559,6 +3559,11 @@ class LLVM_ABI TargetLoweringBase {
 Libcalls.setLibcallImpl(Call, Impl);
   }
 
+  /// Get the libcall impl routine name for the specified libcall.
+  RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const {
+return Libcalls.getLibcallImpl(Call);
+  }
+
   /// Get the libcall routine name for the specified libcall.
   const char *getLibcallName(RTLIB::Libcall Call) const {
 return Libcalls.getLibcallName(Call);
@@ -3581,11 +3586,18 @@ class LLVM_ABI TargetLoweringBase {
   }
 
   /// Set the CallingConv that should be used for the specified libcall.
-  void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
-Libcalls.setLibcallCallingConv(Call, CC);
+  void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
+Libcalls.setLibcallImplCallingConv(Call, CC);
+  }
+
+  /// Get the CallingConv that should be used for the specified libcall
+  /// implementation.
+  CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
+return Libcalls.getLibcallImplCallingConv(Call);
   }
 
   /// Get the CallingConv that should be used for the specified libcall.
+  // FIXME: Remove this wrapper and directly use the used LibcallImpl
   CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
 return Libcalls.getLibcallCallingConv(Call);
   }
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h 
b/llvm/include/llvm/IR/RuntimeLibcalls.h
index c8d97bcd2e664..66d11c4cbabb7 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
template <> struct enum_iteration_traits<RTLIB::Libcall> {
   static constexpr bool is_iterable = true;
 };
 
+template <> struct enum_iteration_traits<RTLIB::LibcallImpl> {
+  static constexpr bool is_iterable = true;
+};
+
 namespace RTLIB {
 
 // Return an iterator over all Libcall values.
@@ -44,6 +48,10 @@ static inline auto libcalls() {
   return enum_seq(static_cast<RTLIB::Libcall>(0), RTLIB::UNKNOWN_LIBCALL);
 }
 
+static inline auto libcall_impls() {
+  return enum_seq(static_cast<RTLIB::LibcallImpl>(1), RTLIB::NumLibcallImpls);
+}
+
 /// A simple container for information about the supported runtime calls.
 struct RuntimeLibcallsInfo {
   explicit RuntimeLibcallsInfo(
@@ -76,16 +84,21 @@ struct RuntimeLibcallsInfo {
 return LibcallImpls[Call];
   }
 
-  /// Set the CallingConv that should be used for the specified libcall.
-  // FIXME: This should be a function of RTLIB::LibcallImpl
-  void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
-LibcallCallingConvs[Call] = CC;
+  /// Set the CallingConv that should be used for the specified libcall
+  /// implementation
+  void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
+LibcallImplCallingConvs[Call] = CC;
   }
 
-  /// Get the CallingConv that should be used for the specified libcall.
-  // FIXME: This should be a function of RTLIB::LibcallImpl
+  // FIXME: Remove this wrapper in favor of directly using
+  // getLibcallImplCallingConv
   CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
-return LibcallCallingConvs[Call];
+return LibcallImplCallingConvs[LibcallImpls[Call]];
+  }
+
+  /// Get the CallingConv that should be used for the specified libcall.
+  CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
+return LibcallImplCallingConvs[Call];
   }
 
   ArrayRef getLibcallImpls() const {
@@ -130,8 +143,9 @@ struct RuntimeLibcallsInfo {
   static_assert(static_cast(CallingConv::C) == 0,
 "default calling conv should be encoded as 0");
 
-  /// Stores the CallingConv that should be used for each libcall.
-  CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL] = {};
+  /// Stores th

[llvm-branch-commits] [llvm] ARM: Unconditionally set eabi libcall calling convs in RuntimeLibcalls (PR #146083)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/146083

>From c61003aede9ddd5db0503428f4dc500718e85028 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 23 Jun 2025 20:14:11 +0900
Subject: [PATCH 1/3] ARM: Unconditionally set eabi libcall calling convs in
 RuntimeLibcalls

This fully consolidates all the calling convention configuration into
RuntimeLibcallsInfo. I'm assuming that __aeabi functions have a universal
calling convention, and that other ABIs simply don't use them. This will
enable splitting of RuntimeLibcallsInfo into the ABI and lowering components.
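
Under that assumption, the configuration collapses to one unconditional loop
over the __aeabi_* implementations (sketch; the full list is in the diff):

```cpp
#include "llvm/IR/RuntimeLibcalls.h"

using namespace llvm;

// Apply ARM_AAPCS to every __aeabi_* implementation regardless of OS;
// ABIs that never select these functions are simply unaffected.
static void setAAPCSLibcallConvs(RTLIB::RuntimeLibcallsInfo &Info) {
  static const RTLIB::LibcallImpl AAPCSLibcalls[] = {
      RTLIB::__aeabi_dadd, RTLIB::__aeabi_ddiv, RTLIB::__aeabi_dmul,
      // ... remaining __aeabi_* entries as listed in the patch ...
  };
  for (RTLIB::LibcallImpl Impl : AAPCSLibcalls)
    Info.setLibcallImplCallingConv(Impl, CallingConv::ARM_AAPCS);
}
```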
---
 llvm/lib/IR/RuntimeLibcalls.cpp |  39 ++
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 159 +++-
 2 files changed, 110 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index dec766698dc1d..e62743860d53d 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -98,6 +98,45 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, 
const Triple &TT,
 Info.setLibcallImpl(RTLIB::SDIVREM_I32, RTLIB::__divmodsi4);
 Info.setLibcallImpl(RTLIB::UDIVREM_I32, RTLIB::__udivmodsi4);
   }
+
+  static const RTLIB::LibcallImpl AAPCS_Libcalls[] = {
+  RTLIB::__aeabi_dadd,   RTLIB::__aeabi_ddiv,
+  RTLIB::__aeabi_dmul,   RTLIB::__aeabi_dsub,
+  RTLIB::__aeabi_dcmpeq__ne, RTLIB::__aeabi_dcmpeq__eq,
+  RTLIB::__aeabi_dcmplt, RTLIB::__aeabi_dcmple,
+  RTLIB::__aeabi_dcmpge, RTLIB::__aeabi_dcmpgt,
+  RTLIB::__aeabi_dcmpun, RTLIB::__aeabi_fadd,
+  RTLIB::__aeabi_fdiv,   RTLIB::__aeabi_fmul,
+  RTLIB::__aeabi_fsub,   RTLIB::__aeabi_fcmpeq__ne,
+  RTLIB::__aeabi_fcmpeq__eq, RTLIB::__aeabi_fcmplt,
+  RTLIB::__aeabi_fcmple, RTLIB::__aeabi_fcmpge,
+  RTLIB::__aeabi_fcmpgt, RTLIB::__aeabi_fcmpun,
+  RTLIB::__aeabi_d2iz,   RTLIB::__aeabi_d2uiz,
+  RTLIB::__aeabi_d2lz,   RTLIB::__aeabi_d2ulz,
+  RTLIB::__aeabi_f2iz,   RTLIB::__aeabi_f2uiz,
+  RTLIB::__aeabi_f2lz,   RTLIB::__aeabi_f2ulz,
+  RTLIB::__aeabi_d2f,RTLIB::__aeabi_d2h,
+  RTLIB::__aeabi_f2d,RTLIB::__aeabi_i2d,
+  RTLIB::__aeabi_ui2d,   RTLIB::__aeabi_l2d,
+  RTLIB::__aeabi_ul2d,   RTLIB::__aeabi_i2f,
+  RTLIB::__aeabi_ui2f,   RTLIB::__aeabi_l2f,
+  RTLIB::__aeabi_ul2f,   RTLIB::__aeabi_lmul,
+  RTLIB::__aeabi_llsl,   RTLIB::__aeabi_llsr,
+  RTLIB::__aeabi_lasr,   RTLIB::__aeabi_idiv__i8,
+  RTLIB::__aeabi_idiv__i16,  RTLIB::__aeabi_idiv__i32,
+  RTLIB::__aeabi_ldivmod,RTLIB::__aeabi_uidiv__i8,
+  RTLIB::__aeabi_uidiv__i16, RTLIB::__aeabi_uidiv__i32,
+  RTLIB::__aeabi_uldivmod,   RTLIB::__aeabi_f2h,
+  RTLIB::__aeabi_d2h,RTLIB::__aeabi_h2f,
+  RTLIB::__aeabi_memcpy, RTLIB::__aeabi_memmove,
+  RTLIB::__aeabi_memset, RTLIB::__aeabi_memcpy4,
+  RTLIB::__aeabi_memcpy8,RTLIB::__aeabi_memmove4,
+  RTLIB::__aeabi_memmove8,   RTLIB::__aeabi_memset4,
+  RTLIB::__aeabi_memset8,RTLIB::__aeabi_memclr,
+  RTLIB::__aeabi_memclr4,RTLIB::__aeabi_memclr8};
+
+  for (RTLIB::LibcallImpl Impl : AAPCS_Libcalls)
+Info.setLibcallImplCallingConv(Impl, CallingConv::ARM_AAPCS);
 }
 
 static void setLongDoubleIsF128Libm(RuntimeLibcallsInfo &Info,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp 
b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 44dcbc9f26616..8c68c6d123514 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -578,9 +578,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine 
&TM_,
   };
   // clang-format on
 
-  for (const auto &LC : LibraryCalls) {
+  for (const auto &LC : LibraryCalls)
 setLibcallImpl(LC.Op, LC.Impl);
-  }
 }
   }
 
@@ -594,94 +593,91 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine 
&TM_,
 static const struct {
   const RTLIB::Libcall Op;
   const RTLIB::LibcallImpl Impl;
-  const CallingConv::ID CC;
 } LibraryCalls[] = {
   // Double-precision floating-point arithmetic helper functions
   // RTABI chapter 4.1.2, Table 2
-  { RTLIB::ADD_F64, RTLIB::__aeabi_dadd, CallingConv::ARM_AAPCS },
-  { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv, CallingConv::ARM_AAPCS },
-  { RTLIB::MUL_F64, RTLIB::__aeabi_dmul, CallingConv::ARM_AAPCS },
-  { RTLIB::SUB_F64, RTLIB::__aeabi_dsub, CallingConv::ARM_AAPCS },
+  { RTLIB::ADD_F64, RTLIB::__aeabi_dadd },
+  { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv },
+  { RTLIB::MUL_F64, RTLIB::__aeabi_dmul },
+  { RTLIB::SUB_F64, RTLIB::__aeabi_dsub },
 
   // Double-precision floating-point comparison helper functions
   // RTABI chapter 4.1.2, Table 3
-  { RTLIB::OEQ_F64, RTLIB::__aeabi_dcmpeq__ne, CallingConv::ARM_AAPCS },
-  { RTLIB::UNE_F64, RTLIB::__aeabi_dcmpeq__eq, CallingConv::ARM_AAPCS },
-  {

[llvm-branch-commits] [llvm] RuntimeLibcalls: Associate calling convention with libcall impls (PR #144979)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/144979

>From 5a9b9d9720f01a7575f33feb73042eafcfa3f82c Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 17 Jun 2025 16:25:50 +0900
Subject: [PATCH] RuntimeLibcalls: Associate calling convention with libcall
 impls

Instead of associating the libcall with the RTLIB::Libcall, put it
into a table indexed by the RTLIB::LibcallImpl. The LibcallImpls
should contain all ABI details for a particular implementation, not
the abstract Libcall. In the future the wrappers in terms of the
RTLIB::Libcall should be removed.
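A minimal sketch of the new indexing (illustrative names, not the real headers): the calling-convention table is keyed by the concrete impl, and the old Libcall-based query becomes a wrapper that resolves the impl first:

```cpp
// Sketch only: stand-ins for RTLIB::Libcall / RTLIB::LibcallImpl.
enum Libcall : unsigned { ADD_F64, UNKNOWN_LIBCALL };
enum LibcallImpl : unsigned { Unsupported, aeabi_dadd, NumLibcallImpls };
enum class CallingConv : unsigned { C = 0, ARM_AAPCS };

struct RuntimeLibcallsSketch {
  LibcallImpl Impls[UNKNOWN_LIBCALL] = {};     // Libcall -> chosen impl
  CallingConv ImplConvs[NumLibcallImpls] = {}; // LibcallImpl -> conv

  CallingConv getLibcallImplCallingConv(LibcallImpl Impl) const {
    return ImplConvs[Impl];
  }
  // Legacy wrapper: two lookups, to be removed once callers use impls.
  CallingConv getLibcallCallingConv(Libcall LC) const {
    return ImplConvs[Impls[LC]];
  }
};
```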
---
 llvm/include/llvm/CodeGen/TargetLowering.h| 16 -
 llvm/include/llvm/IR/RuntimeLibcalls.h| 32 ++---
 llvm/lib/IR/RuntimeLibcalls.cpp   | 70 +++
 llvm/lib/Target/ARM/ARMISelLowering.cpp   | 18 ++---
 llvm/lib/Target/Lanai/LanaiISelLowering.cpp   |  4 +-
 llvm/lib/Target/MSP430/MSP430ISelLowering.cpp |  3 +-
 6 files changed, 92 insertions(+), 51 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h 
b/llvm/include/llvm/CodeGen/TargetLowering.h
index 420f1d5fb20ca..fee94cc167363 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3559,6 +3559,11 @@ class LLVM_ABI TargetLoweringBase {
 Libcalls.setLibcallImpl(Call, Impl);
   }
 
+  /// Get the libcall impl routine name for the specified libcall.
+  RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const {
+return Libcalls.getLibcallImpl(Call);
+  }
+
   /// Get the libcall routine name for the specified libcall.
   const char *getLibcallName(RTLIB::Libcall Call) const {
 return Libcalls.getLibcallName(Call);
@@ -3581,11 +3586,18 @@ class LLVM_ABI TargetLoweringBase {
   }
 
   /// Set the CallingConv that should be used for the specified libcall.
-  void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
-Libcalls.setLibcallCallingConv(Call, CC);
+  void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
+Libcalls.setLibcallImplCallingConv(Call, CC);
+  }
+
+  /// Get the CallingConv that should be used for the specified libcall
+  /// implementation.
+  CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
+return Libcalls.getLibcallImplCallingConv(Call);
   }
 
   /// Get the CallingConv that should be used for the specified libcall.
+  // FIXME: Remove this wrapper and directly use the used LibcallImpl
   CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
 return Libcalls.getLibcallCallingConv(Call);
   }
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h 
b/llvm/include/llvm/IR/RuntimeLibcalls.h
index c8d97bcd2e664..66d11c4cbabb7 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -37,6 +37,10 @@ template <> struct enum_iteration_traits {
   static constexpr bool is_iterable = true;
 };
 
+template <> struct enum_iteration_traits {
+  static constexpr bool is_iterable = true;
+};
+
 namespace RTLIB {
 
 // Return an iterator over all Libcall values.
@@ -44,6 +48,10 @@ static inline auto libcalls() {
   return enum_seq(static_cast(0), RTLIB::UNKNOWN_LIBCALL);
 }
 
+static inline auto libcall_impls() {
+  return enum_seq(static_cast(1), RTLIB::NumLibcallImpls);
+}
+
 /// A simple container for information about the supported runtime calls.
 struct RuntimeLibcallsInfo {
   explicit RuntimeLibcallsInfo(
@@ -76,16 +84,21 @@ struct RuntimeLibcallsInfo {
 return LibcallImpls[Call];
   }
 
-  /// Set the CallingConv that should be used for the specified libcall.
-  // FIXME: This should be a function of RTLIB::LibcallImpl
-  void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
-LibcallCallingConvs[Call] = CC;
+  /// Set the CallingConv that should be used for the specified libcall
+  /// implementation
+  void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
+LibcallImplCallingConvs[Call] = CC;
   }
 
-  /// Get the CallingConv that should be used for the specified libcall.
-  // FIXME: This should be a function of RTLIB::LibcallImpl
+  // FIXME: Remove this wrapper in favor of directly using
+  // getLibcallImplCallingConv
   CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
-return LibcallCallingConvs[Call];
+return LibcallImplCallingConvs[LibcallImpls[Call]];
+  }
+
+  /// Get the CallingConv that should be used for the specified libcall.
+  CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
+return LibcallImplCallingConvs[Call];
   }
 
   ArrayRef getLibcallImpls() const {
@@ -130,8 +143,9 @@ struct RuntimeLibcallsInfo {
   static_assert(static_cast(CallingConv::C) == 0,
 "default calling conv should be encoded as 0");
 
-  /// Stores the CallingConv that should be used for each libcall.
-  CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL] = {};
+  /// Stores th

[llvm-branch-commits] [llvm] ARM: Start moving runtime libcalls into tablegen (PR #146084)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/146084

>From 8eb2e09e5f533bbf706445437d7cf5590d775fab Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 23 Jun 2025 23:23:01 +0900
Subject: [PATCH] ARM: Start moving runtime libcalls into tablegen

We still need to manually set the calling conventions of
some libcalls until the lowering is separated out.
---
 llvm/include/llvm/IR/RuntimeLibcalls.h|  2 +-
 llvm/include/llvm/IR/RuntimeLibcalls.td   | 48 
 llvm/lib/IR/RuntimeLibcalls.cpp   | 73 +--
 .../RuntimeLibcallEmitter-calling-conv.td |  2 +-
 llvm/test/TableGen/RuntimeLibcallEmitter.td   |  2 +-
 .../TableGen/Basic/RuntimeLibcallsEmitter.cpp |  2 +-
 6 files changed, 53 insertions(+), 76 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h 
b/llvm/include/llvm/IR/RuntimeLibcalls.h
index ac83df3a4189e..4ea5ff9c9ade8 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -170,7 +170,7 @@ struct RuntimeLibcallsInfo {
   void initDefaultLibCallImpls();
 
   /// Generated by tablegen.
-  void setTargetRuntimeLibcallSets(const Triple &TT);
+  void setTargetRuntimeLibcallSets(const Triple &TT, FloatABI::ABIType 
FloatABI);
 
   /// Set default libcall names. If a target wants to opt-out of a libcall it
   /// should be placed here.
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td 
b/llvm/include/llvm/IR/RuntimeLibcalls.td
index c15ffa0653335..0237c8b41ae8c 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -17,6 +17,7 @@ class DuplicateLibcallImplWithPrefix
 
 /// Libcall Predicates
 def isOSDarwin : RuntimeLibcallPredicate<"TT.isOSDarwin()">;
+def isOSWindows : RuntimeLibcallPredicate<"TT.isOSWindows()">;
 
 def darwinHasSinCosStret : RuntimeLibcallPredicate<"darwinHasSinCosStret(TT)">;
 def darwinHasExp10 : RuntimeLibcallPredicate<"darwinHasExp10(TT)">;
@@ -1272,6 +1273,7 @@ def __aeabi_memclr4 : RuntimeLibcallImpl;
 def __aeabi_memclr8 : RuntimeLibcallImpl;
 
 // isTargetWindows()
+defset list WindowsFPIntCastLibcalls = {
 def __stoi64 : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
 def __dtoi64 : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
 def __stou64 : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
@@ -1280,6 +1282,7 @@ def __i64tos : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_V
 def __i64tod : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
 def __u64tos : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
 def __u64tod : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS_VFP
+}
 
 def __rt_sdiv : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS
 def __rt_sdiv64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS
@@ -1306,6 +1309,51 @@ def __aeabi_h2f : RuntimeLibcallImpl; // 
CallingConv::ARM_AAPCS
 def __gnu_f2h_ieee : RuntimeLibcallImpl;
 def __gnu_h2f_ieee : RuntimeLibcallImpl;
 
+
+def WindowARMDivRemCalls : LibcallImpls<
+  (add __rt_sdiv, __rt_sdiv64, __rt_udiv, __rt_udiv64),
+  isOSWindows> {
+  let CallingConv = ARM_AAPCS;
+}
+
+def WindowARMFPIntCasts : LibcallImpls<
+  (add WindowsFPIntCastLibcalls),
+  isOSWindows> {
+  let CallingConv = ARM_AAPCS_VFP;
+}
+
+
+// Register based DivRem for AEABI (RTABI 4.2)
+def AEABIDivRemCalls : LibcallImpls<
+  (add __aeabi_idivmod, __aeabi_ldivmod,
+   __aeabi_uidivmod, __aeabi_uldivmod),
+  RuntimeLibcallPredicate<[{TT.isTargetAEABI() || TT.isAndroid() || 
TT.isTargetGNUAEABI() ||
+TT.isTargetMuslAEABI()}]>> {
+  let CallingConv = ARM_AAPCS;
+}
+
+def isARMOrThumb : RuntimeLibcallPredicate<"TT.isARM() || TT.isThumb()">;
+
+def ARMSystemLibrary
+: SystemRuntimeLibrary>)> {
+  let DefaultLibcallCallingConv = LibcallCallingConv<[{
+ (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit()) 
?
+(FloatABI == FloatABI::Hard ? CallingConv::ARM_AAPCS_VFP
+: CallingConv::ARM_AAPCS) :
+  CallingConv::C
+  }]>;
+}
+
 
//===--===//
 // AVR Runtime Libcalls
 
//===--===//
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index c2d0b0684ec39..2d168befd145c 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -25,77 +25,6 @@ static cl::opt
 static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT,
FloatABI::ABIType FloatABIType,
EABI EABIVersion) {
-  if (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit()) 
{
-CallingConv::ID DefaultCC = FloatABIType == FloatABI::Hard
-? CallingConv::ARM_AAPCS_VFP
-: CallingConv::ARM_AAPCS;
-for (RTLIB::LibcallImpl LC : RTLIB::libcall_i

[llvm-branch-commits] [llvm] RuntimeLibcalls: Remove table of soft float compare cond codes (PR #146082)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/146082

>From effe1ad6d053a4dffccc3d68574868565ce94397 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 23 Jun 2025 19:10:30 +0900
Subject: [PATCH] RuntimeLibcalls: Remove table of soft float compare cond
 codes

Previously we had a table of entries for every Libcall for
the comparison to use against an integer 0 if it was a soft
float compare function. This was only relevant to a handful of
opcodes, so it was wasteful. Now that we can distinguish the
abstract libcall for the compare with the concrete implementation,
we can just directly hardcode the comparison against the libcall
impl without this configuration system.
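As a sketch of the resulting scheme (placeholder enum values; the real code works on RTLIB::LibcallImpl), the predicate used to test the libcall's integer result against zero becomes a fixed property of each compare impl rather than a configurable table entry:

```cpp
// Sketch only: placeholder names; the suffix encodes the comparison that
// is applied to the libcall's integer result against zero.
enum LibcallImpl { aeabi_dcmpeq__eq, aeabi_dcmpeq__ne, OtherImpl };
enum class Predicate { ICMP_EQ, ICMP_NE, Invalid };

static Predicate getSoftFloatCmpPredicate(LibcallImpl Impl) {
  switch (Impl) {
  case aeabi_dcmpeq__ne:
    return Predicate::ICMP_NE;
  case aeabi_dcmpeq__eq:
    return Predicate::ICMP_EQ;
  default:
    return Predicate::Invalid; // not a soft-float compare libcall
  }
}
```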
---
 .../include/llvm/CodeGen/RuntimeLibcallUtil.h |   3 -
 llvm/include/llvm/CodeGen/TargetLowering.h|  17 +-
 llvm/include/llvm/IR/RuntimeLibcalls.h|  32 +---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  16 +-
 llvm/lib/CodeGen/TargetLoweringBase.cpp   | 107 +++
 llvm/lib/IR/RuntimeLibcalls.cpp   |  31 ---
 llvm/lib/Target/ARM/ARMISelLowering.cpp   | 176 +-
 7 files changed, 182 insertions(+), 200 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h 
b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index 7481ed5b80b3f..09a8151e9ec9c 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -105,9 +105,6 @@ LLVM_ABI Libcall 
getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
 /// UNKNOW_LIBCALL if there is none.
 LLVM_ABI Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
 
-/// Initialize the default condition code on the libcalls.
-LLVM_ABI void initCmpLibcallCCs(ISD::CondCode *CmpLibcallCCs);
-
 } // namespace RTLIB
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h 
b/llvm/include/llvm/CodeGen/TargetLowering.h
index fee94cc167363..fa46d296bf533 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3571,19 +3571,10 @@ class LLVM_ABI TargetLoweringBase {
 
   const char *getMemcpyName() const { return Libcalls.getMemcpyName(); }
 
-  /// Override the default CondCode to be used to test the result of the
-  /// comparison libcall against zero.
-  /// FIXME: This should be removed
-  void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) {
-Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred);
-  }
-
-  /// Get the CondCode that's to be used to test the result of the comparison
-  /// libcall against zero.
-  CmpInst::Predicate
-  getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const {
-return Libcalls.getSoftFloatCmpLibcallPredicate(Call);
-  }
+  /// Get the comparison predicate that's to be used to test the result of the
+  /// comparison libcall against zero. This should only be used with
+  /// floating-point compare libcalls.
+  ISD::CondCode getSoftFloatCmpLibcallPredicate(RTLIB::LibcallImpl Call) const;
 
   /// Set the CallingConv that should be used for the specified libcall.
   void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h 
b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 66d11c4cbabb7..ac83df3a4189e 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -59,7 +59,6 @@ struct RuntimeLibcallsInfo {
   ExceptionHandling ExceptionModel = ExceptionHandling::None,
   FloatABI::ABIType FloatABI = FloatABI::Default,
   EABI EABIVersion = EABI::Default, StringRef ABIName = "") {
-initSoftFloatCmpLibcallPredicates();
 initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName);
   }
 
@@ -106,22 +105,6 @@ struct RuntimeLibcallsInfo {
 return ArrayRef(LibcallImpls).drop_front();
   }
 
-  /// Get the comparison predicate that's to be used to test the result of the
-  /// comparison libcall against zero. This should only be used with
-  /// floating-point compare libcalls.
-  // FIXME: This should be a function of RTLIB::LibcallImpl
-  CmpInst::Predicate
-  getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const {
-return SoftFloatCompareLibcallPredicates[Call];
-  }
-
-  // FIXME: This should be removed. This should be private constant.
-  // FIXME: This should be a function of RTLIB::LibcallImpl
-  void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call,
-   CmpInst::Predicate Pred) {
-SoftFloatCompareLibcallPredicates[Call] = Pred;
-  }
-
   /// Return a function name compatible with RTLIB::MEMCPY, or nullptr if fully
   /// unsupported.
   const char *getMemcpyName() const {
@@ -132,6 +115,11 @@ struct RuntimeLibcallsInfo {
 return getLibcallName(RTLIB::MEMMOVE);
   }
 
+  /// Return the libcall provided by \p Impl
+  static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) {
+return ImplToLibcall[Impl];
+  }
+
 priva

[llvm-branch-commits] [llvm] [mlir] [mlir][GPU][transform] Add gpu_to_rocdl conversion pattern to transfo… (PR #146962)

2025-07-07 Thread Oleksandr Alex Zinenko via llvm-branch-commits

ftynse wrote:

Actually, re:

> Authored-by: Son Tuan Vu [vu...@google.com](mailto:vu...@google.com)

I think you should reupload after patching the commit to specify the correct 
author (`git commit --amend --author="..."`). GitHub tracks this correctly.

https://github.com/llvm/llvm-project/pull/146962
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits


@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1(
 ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f

arsenm wrote:

These should always be printed with the named counter syntax 

https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Pierre van Houtryve via llvm-branch-commits


@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1(
 ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f

Pierre-vh wrote:

That's unexpected, right? Same for the vmcnt wait above.

https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm edited 
https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)

2025-07-07 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/141589

>From d906a978145aabae8b2d1a029477d5a08272ae8c Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Tue, 27 May 2025 11:16:16 +0200
Subject: [PATCH 1/4] [AMDGPU] Move S_BFE lowering into RegBankCombiner

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td   |  14 +-
 .../Target/AMDGPU/AMDGPURegBankCombiner.cpp   |  51 +++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  | 125 --
 3 files changed, 119 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td 
b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9587fad1ecd63..94e1175b06b14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[
   canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
 ]>;
 
+// Early select of uniform BFX into S_BFE instructions.
+// These instructions encode the offset/width in a way that requires using
+// bitwise operations. Selecting these instructions early allow the combiner
+// to potentially fold these.
+class lower_uniform_bfx : GICombineRule<
+  (defs root:$bfx),
+  (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); 
}])>;
+
+def lower_uniform_sbfx : lower_uniform_bfx;
+def lower_uniform_ubfx : lower_uniform_bfx;
+
 let Predicates = [Has16BitInsts, NotHasMed3_16] in {
 // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
 // saves one instruction compared to the promotion.
@@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner<
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
-   cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+   cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+   lower_uniform_sbfx, lower_uniform_ubfx]> {
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index ee324a5e93f0f..2100900bb8eb2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
 
   void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) 
const;
 
+  bool lowerUniformBFX(MachineInstr &MI) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -392,6 +394,55 @@ void 
AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
   MI.eraseFromParent();
 }
 
+bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
+  assert(MI.getOpcode() == TargetOpcode::G_UBFX ||
+ MI.getOpcode() == TargetOpcode::G_SBFX);
+  const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI);
+  assert(RB && "No RB?");
+  if (RB->getID() != AMDGPU::SGPRRegBankID)
+return false;
+
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register OffsetReg = MI.getOperand(2).getReg();
+  Register WidthReg = MI.getOperand(3).getReg();
+
+  const LLT S32 = LLT::scalar(32);
+  LLT Ty = MRI.getType(DstReg);
+
+  const unsigned Opc = (Ty == S32)
+   ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+   : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+
+  // Ensure the high bits are clear to insert the offset.
+  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6));
+  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+
+  // Zeros out the low bits, so don't bother clamping the input value.
+  auto ShiftAmt = B.buildConstant(S32, 16);
+  auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt);
+
+  // Transformation function, pack the offset and width of a BFE into
+  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+  // source, bits [5:0] contain the offset and bits [22:16] the width.
+  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+
+  MRI.setRegBank(OffsetMask.getReg(0), *RB);
+  MRI.setRegBank(ClampOffset.getReg(0), *RB);
+  MRI.setRegBank(ShiftAmt.getReg(0), *RB);
+  MRI.setRegBank(ShiftWidth.getReg(0), *RB);
+  MRI.setRegBank(MergedInputs.getReg(0), *RB);
+
+  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+  if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
+llvm_unreachable("failed to constrain BFE");
+
+  MI.eraseFromParent();
+  return true;
+}
+
 SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
   return MF.getInfo()->getMode();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 6874657a4ffe7..140c2babb013f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/li
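The packing that the new `lowerUniformBFX` combine builds can be summarized in a short sketch (hypothetical helper name, not part of the patch): the offset goes in bits [5:0] and the width in bits [22:16] of the second S_BFE source operand.

```cpp
#include <cstdint>

// Sketch only: pack a BFE offset/width pair into the second source operand
// expected by S_BFE_{I,U}32/64 (offset in [5:0], width in [22:16]).
static uint32_t packSBFEOperand(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x3Fu) | (Width << 16);
}
```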

[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)

2025-07-07 Thread Pierre van Houtryve via llvm-branch-commits

https://github.com/Pierre-vh updated 
https://github.com/llvm/llvm-project/pull/141591

>From b386d126b9f560bf203fd044d81575ddfad2a8c6 Mon Sep 17 00:00:00 2001
From: pvanhout 
Date: Tue, 27 May 2025 12:29:02 +0200
Subject: [PATCH 1/2] [AMDGPU] Add KnownBits simplification combines to
 RegBankCombiner

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td   |  3 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   | 59 -
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 61 +++---
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 63 +++
 llvm/test/CodeGen/AMDGPU/div_i128.ll  | 30 -
 llvm/test/CodeGen/AMDGPU/itofp.i128.ll| 11 ++--
 llvm/test/CodeGen/AMDGPU/lround.ll| 18 +++---
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   | 16 +
 8 files changed, 104 insertions(+), 157 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td 
b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 96be17c487130..df867aaa204b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -210,5 +210,6 @@ def AMDGPURegBankCombiner : GICombiner<
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
-   lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> {
+   lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract,
+   known_bits_simplifications]> {
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 6baa10bb48621..cc0f45681a3e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1744,63 +1744,64 @@ define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX6-LABEL: v_lshr_i65_33:
 ; GFX6:   ; %bb.0:
 ; GFX6-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:v_mov_b32_e32 v3, v1
-; GFX6-NEXT:v_mov_b32_e32 v0, 1
+; GFX6-NEXT:v_mov_b32_e32 v3, 1
+; GFX6-NEXT:v_mov_b32_e32 v4, 0
+; GFX6-NEXT:v_and_b32_e32 v3, 1, v2
+; GFX6-NEXT:v_lshl_b64 v[2:3], v[3:4], 31
+; GFX6-NEXT:v_lshrrev_b32_e32 v0, 1, v1
+; GFX6-NEXT:v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:v_mov_b32_e32 v1, 0
-; GFX6-NEXT:v_and_b32_e32 v0, 1, v2
-; GFX6-NEXT:v_lshl_b64 v[0:1], v[0:1], 31
-; GFX6-NEXT:v_lshrrev_b32_e32 v2, 1, v3
-; GFX6-NEXT:v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i65_33:
 ; GFX8:   ; %bb.0:
 ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:v_mov_b32_e32 v3, v1
-; GFX8-NEXT:v_mov_b32_e32 v0, 1
+; GFX8-NEXT:v_mov_b32_e32 v3, 1
+; GFX8-NEXT:v_mov_b32_e32 v4, 0
+; GFX8-NEXT:v_and_b32_e32 v3, 1, v2
+; GFX8-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4]
+; GFX8-NEXT:v_lshrrev_b32_e32 v0, 1, v1
+; GFX8-NEXT:v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:v_mov_b32_e32 v1, 0
-; GFX8-NEXT:v_and_b32_e32 v0, 1, v2
-; GFX8-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX8-NEXT:v_lshrrev_b32_e32 v2, 1, v3
-; GFX8-NEXT:v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i65_33:
 ; GFX9:   ; %bb.0:
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:v_mov_b32_e32 v3, v1
-; GFX9-NEXT:v_mov_b32_e32 v0, 1
+; GFX9-NEXT:v_mov_b32_e32 v3, 1
+; GFX9-NEXT:v_mov_b32_e32 v4, 0
+; GFX9-NEXT:v_and_b32_e32 v3, 1, v2
+; GFX9-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4]
+; GFX9-NEXT:v_lshrrev_b32_e32 v0, 1, v1
+; GFX9-NEXT:v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:v_mov_b32_e32 v1, 0
-; GFX9-NEXT:v_and_b32_e32 v0, 1, v2
-; GFX9-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX9-NEXT:v_lshrrev_b32_e32 v2, 1, v3
-; GFX9-NEXT:v_or_b32_e32 v0, v2, v0
 ; GFX9-NEXT:v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_lshr_i65_33:
 ; GFX10:   ; %bb.0:
 ; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:v_mov_b32_e32 v3, v1
-; GFX10-NEXT:v_mov_b32_e32 v0, 1
+; GFX10-NEXT:v_mov_b32_e32 v3, 1
+; GFX10-NEXT:v_mov_b32_e32 v4, 0
+; GFX10-NEXT:v_and_b32_e32 v3, 1, v2
+; GFX10-NEXT:v_lshrrev_b32_e32 v0, 1, v1
 ; GFX10-NEXT:v_mov_b32_e32 v1, 0
-; GFX10-NEXT:v_and_b32_e32 v0, 1, v2
-; GFX10-NEXT:v_lshrrev_b32_e32 v2, 1, v3
-; GFX10-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX10-NEXT:v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4]
+; GFX10-NEXT:v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_lshr_i65_33:
 ; GFX11:   ; %bb.0:
 ; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1
-; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_an

[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Sameer Sahasrabuddhe via llvm-branch-commits


@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1(
 ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f

ssahasra wrote:

> These should always be printed with the named counter syntax

I haven't checked what's different about this wait count for it to be printed 
like this. Will need to follow it up as a separate change.

https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Sameer Sahasrabuddhe via llvm-branch-commits


@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1(
 ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f

ssahasra wrote:

If we agree with the basic design, then these are expected. There's a whole 
bunch of tests that either stop at the memory legalizer or run llc with 
`-O0`, like this one. The "trivial" wait counts show up in all these tests 
because SIInsertWaitcnts did not get a chance to clean them up. In particular, 
see how `TrySimplify` in that pass controls whether or not to clean up these 
wait counts. They disappear in the optimized ISA output.
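
A minimal sketch of that gating (hypothetical shape, not the actual SIInsertWaitcnts code):

```cpp
// Sketch only: a trivially-~0 soft wait is elided only when simplification
// is allowed, so -O0 or legalizer-only pipelines still show it.
constexpr unsigned WaitAll = ~0u; // "wait for nothing outstanding"

static bool keepSoftWaitcnt(unsigned Count, bool IsSoft, bool TrySimplify) {
  if (!IsSoft)
    return true;           // hard waits are never dropped here
  if (!TrySimplify)
    return true;           // unoptimized runs keep the trivial wait
  return Count != WaitAll; // drop only the trivially-satisfied wait
}
```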

https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Pierre van Houtryve via llvm-branch-commits


@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1(
 ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f

Pierre-vh wrote:

The waitcnts aren't optimized out at O0 because we want to see them in memory 
legalizer tests; however, we're mostly interested in the waitcnt zero, not the 
waitcnt ~0. We could still optimize out the ~0 ones; I don't think there is a 
downside to that.

https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Sameer Sahasrabuddhe via llvm-branch-commits


@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1(
 ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f

ssahasra wrote:

Yes, I did consider that as an option. But there is the hypothetical corner 
case where the memory legalizer might deliberately compute the wait count to be 
so large that it gets clamped at the max value (not the same as ~0, strictly 
speaking). If that is not an issue, it will significantly reduce the diff for 
tests that don't stop after the legalizer.
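
As a sketch of the distinction (made-up helper and field width, purely illustrative): a count clamped to the counter's maximum encodable value is still a real wait, while only the ~0 sentinel means "no wait".

```cpp
#include <algorithm>
#include <cstdint>

// Sketch only: a computed count clamped to the field maximum (e.g. 0x3f for
// a 6-bit counter) must still be emitted; only the ~0 "no wait" sentinel is
// safe to drop.
static uint32_t clampWaitCount(uint32_t Computed, uint32_t FieldMax) {
  return std::min(Computed, FieldMax); // may equal FieldMax yet be meaningful
}
```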

https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] Triple: Record default exception handling type (PR #147225)

2025-07-07 Thread Joseph Huber via llvm-branch-commits

https://github.com/jhuber6 approved this pull request.


https://github.com/llvm/llvm-project/pull/147225
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] ARM: Remove subtarget field tracking SjLj (PR #147226)

2025-07-07 Thread Daniel Kiss via llvm-branch-commits

https://github.com/DanielKristofKiss approved this pull request.

lgtm

https://github.com/llvm/llvm-project/pull/147226
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] Triple: Record default exception handling type (PR #147225)

2025-07-07 Thread Daniel Kiss via llvm-branch-commits

https://github.com/DanielKristofKiss approved this pull request.


https://github.com/llvm/llvm-project/pull/147225
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Sameer Sahasrabuddhe via llvm-branch-commits

ssahasra wrote:

This is part of a stack:

- #147258
- #147257 
- #147256 

https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] efficiently wait for direct loads to LDS at all scopes (PR #147258)

2025-07-07 Thread Sameer Sahasrabuddhe via llvm-branch-commits

ssahasra wrote:

This is part of a stack:

- #147258
- #147257 
- #147256 

https://github.com/llvm/llvm-project/pull/147258
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] 902eaa1 - Revert "[clang][modules-driver] Add scanner to detect C++20 module presence (…"

2025-07-07 Thread via llvm-branch-commits

Author: Corentin Jabot
Date: 2025-07-07T14:05:22+02:00
New Revision: 902eaa1d5e24beb2a7129c61dfa84759a668f62f

URL: 
https://github.com/llvm/llvm-project/commit/902eaa1d5e24beb2a7129c61dfa84759a668f62f
DIFF: 
https://github.com/llvm/llvm-project/commit/902eaa1d5e24beb2a7129c61dfa84759a668f62f.diff

LOG: Revert "[clang][modules-driver] Add scanner to detect C++20 module 
presence (…"

This reverts commit ded142671663c404f4d9fb9ef4867b4fc680409a.

Added: 


Modified: 
clang/include/clang/Basic/DiagnosticDriverKinds.td
clang/include/clang/Basic/DiagnosticGroups.td
clang/include/clang/Driver/Driver.h
clang/include/clang/Driver/Options.td
clang/lib/Driver/Driver.cpp
clang/test/Frontend/warning-options.cpp

Removed: 
clang/test/Driver/modules-driver-cxx20-module-usage-scanner.cpp



diff  --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td 
b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index efba09dc140f6..34b6c0d7a8acd 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -577,16 +577,6 @@ def err_drv_reduced_module_output_overrided : Warning<
   "please consider use '-fmodule-output=' to specify the output file for 
reduced BMI explicitly">,
   InGroup>;
 
-def remark_found_cxx20_module_usage : Remark<
-  "found C++20 module usage in file '%0'">,
-  InGroup;
-def remark_performing_driver_managed_module_build : Remark<
-  "performing driver managed module build">,
-  InGroup;
-def warn_modules_driver_unsupported_standard : Warning<
-  "'-fmodules-driver' is not supported before C++20">,
-  InGroup;
-
 def warn_drv_delayed_template_parsing_after_cxx20 : Warning<
   "-fdelayed-template-parsing is deprecated after C++20">,
   InGroup>;

diff  --git a/clang/include/clang/Basic/DiagnosticGroups.td 
b/clang/include/clang/Basic/DiagnosticGroups.td
index d2aa380f8d73f..36fa3227fd6a6 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -625,7 +625,6 @@ def ModuleConflict : DiagGroup<"module-conflict">;
 def ModuleFileExtension : DiagGroup<"module-file-extension">;
 def ModuleIncludeDirectiveTranslation : 
DiagGroup<"module-include-translation">;
 def ModuleMap : DiagGroup<"module-map">;
-def ModulesDriver : DiagGroup<"modules-driver">;
 def RoundTripCC1Args : DiagGroup<"round-trip-cc1-args">;
 def NewlineEOF : DiagGroup<"newline-eof">;
 def Nullability : DiagGroup<"nullability">;

diff  --git a/clang/include/clang/Driver/Driver.h 
b/clang/include/clang/Driver/Driver.h
index 2a04f6dd27655..d9e328fe918bc 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -504,9 +504,6 @@ class Driver {
 
   /// BuildActions - Construct the list of actions to perform for the
   /// given arguments, which are only done for a single architecture.
-  /// If the compilation is an explicit module build, delegates to
-  /// BuildDriverManagedModuleBuildActions. Otherwise, BuildDefaultActions is
-  /// used.
   ///
   /// \param C - The compilation that is being built.
   /// \param Args - The input arguments.
@@ -792,35 +789,6 @@ class Driver {
   /// compilation based on which -f(no-)?lto(=.*)? option occurs last.
   void setLTOMode(const llvm::opt::ArgList &Args);
 
-  /// BuildDefaultActions - Constructs the list of actions to perform
-  /// for the provided arguments, which are only done for a single 
architecture.
-  ///
-  /// \param C - The compilation that is being built.
-  /// \param Args - The input arguments.
-  /// \param Actions - The list to store the resulting actions onto.
-  void BuildDefaultActions(Compilation &C, llvm::opt::DerivedArgList &Args,
-   const InputList &Inputs, ActionList &Actions) const;
-
-  /// BuildDriverManagedModuleBuildActions - Performs a dependency
-  /// scan and constructs the list of actions to perform for dependency order
-  /// and the provided arguments. This is only done for a single a 
architecture.
-  ///
-  /// \param C - The compilation that is being built.
-  /// \param Args - The input arguments.
-  /// \param Actions - The list to store the resulting actions onto.
-  void BuildDriverManagedModuleBuildActions(Compilation &C,
-llvm::opt::DerivedArgList &Args,
-const InputList &Inputs,
-ActionList &Actions) const;
-
-  /// Scans the leading lines of the C++ source inputs to detect C++20 module
-  /// usage.
-  ///
-  /// \returns True if module usage is detected, false otherwise, or an error 
on
-  /// read failure.
-  llvm::ErrorOr
-  ScanInputsForCXXModuleUsage(const InputList &Inputs) const;
-
   /// Retrieves a ToolChain for a particular \p Target triple.
   ///
   /// Will cache ToolChains for the life of the driver object, and create them

[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)

2025-07-07 Thread Pierre van Houtryve via llvm-branch-commits


@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1(
 ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f

Pierre-vh wrote:

That's a valid concern, though the MemoryLegalizer currently only inserts 
waitcnt 0, I think? I also don't see why the memory legalizer would insert 
non-zero soft waitcnts; I think those would need to be non-soft (but that's 
not enforced anywhere, AFAIK).

https://github.com/llvm/llvm-project/pull/147257
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 updated 
https://github.com/llvm/llvm-project/pull/147295

>From e870b040c4ef29b7ca2e50c1fc0ab5a2446f5cf6 Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena 
Date: Sun, 6 Jul 2025 19:12:55 +
Subject: [PATCH] [LifetimeSafety] Propagate loans using dataflow analysis

---
 clang/lib/Analysis/LifetimeSafety.cpp | 258 +-
 .../Sema/warn-lifetime-safety-dataflow.cpp| 186 +
 2 files changed, 443 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Analysis/LifetimeSafety.cpp 
b/clang/lib/Analysis/LifetimeSafety.cpp
index 2c2309de90e26..e881e592ef59f 100644
--- a/clang/lib/Analysis/LifetimeSafety.cpp
+++ b/clang/lib/Analysis/LifetimeSafety.cpp
@@ -13,7 +13,10 @@
 #include "clang/Analysis/Analyses/PostOrderCFGView.h"
 #include "clang/Analysis/AnalysisDeclContext.h"
 #include "clang/Analysis/CFG.h"
+#include "clang/Analysis/FlowSensitive/DataflowWorklist.h"
 #include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/ImmutableMap.h"
+#include "llvm/ADT/ImmutableSet.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
@@ -482,7 +485,247 @@ class FactGenerator : public 
ConstStmtVisitor {
 };
 
 // = //
-//  TODO: Run dataflow analysis to propagate loans, analyse and error 
reporting.
+//  The Dataflow Lattice
+// = //
+
+// Using LLVM's immutable collections is efficient for dataflow analysis
+// as it avoids deep copies during state transitions.
+// TODO(opt): Consider using a bitset to represent the set of loans.
+using LoanSet = llvm::ImmutableSet;
+using OriginLoanMap = llvm::ImmutableMap;
+
+/// An object to hold the factories for immutable collections, ensuring
+/// that all created states share the same underlying memory management.
+struct LifetimeFactory {
+  OriginLoanMap::Factory OriginMapFact;
+  LoanSet::Factory LoanSetFact;
+
+  LoanSet createLoanSet(LoanID LID) {
+return LoanSetFact.add(LoanSetFact.getEmptySet(), LID);
+  }
+};
+
+/// LifetimeLattice represents the state of our analysis at a given program
+/// point. It is an immutable object, and all operations produce a new
+/// instance rather than modifying the existing one.
+struct LifetimeLattice {
+  /// The map from an origin to the set of loans it contains.
+  /// TODO(opt): To reduce the lattice size, propagate origins of declarations,
+  /// not expressions, because expressions are not visible across blocks.
+  OriginLoanMap Origins = OriginLoanMap(nullptr);
+
+  explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {}
+  LifetimeLattice() = default;
+
+  bool operator==(const LifetimeLattice &Other) const {
+return Origins == Other.Origins;
+  }
+  bool operator!=(const LifetimeLattice &Other) const {
+return !(*this == Other);
+  }
+
+  LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const {
+if (auto *Loans = Origins.lookup(OID))
+  return *Loans;
+return Factory.LoanSetFact.getEmptySet();
+  }
+
+  /// Computes the union of two lattices by performing a key-wise join of
+  /// their OriginLoanMaps.
+  // TODO(opt): This key-wise join is a performance bottleneck. A more
+  // efficient merge could be implemented using a Patricia Trie or HAMT
+  // instead of the current AVL-tree-based ImmutableMap.
+  LifetimeLattice join(const LifetimeLattice &Other,
+   LifetimeFactory &Factory) const {
+/// Merge the smaller map into the larger one ensuring we iterate over the
+/// smaller map.
+if (Origins.getHeight() < Other.Origins.getHeight())
+  return Other.join(*this, Factory);
+
+OriginLoanMap JoinedState = Origins;
+// For each origin in the other map, union its loan set with ours.
+for (const auto &Entry : Other.Origins) {
+  OriginID OID = Entry.first;
+  LoanSet OtherLoanSet = Entry.second;
+  JoinedState = Factory.OriginMapFact.add(
+  JoinedState, OID,
+  join(getLoans(OID, Factory), OtherLoanSet, Factory));
+}
+return LifetimeLattice(JoinedState);
+  }
+
+  LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const {
+/// Merge the smaller set into the larger one ensuring we iterate over the
+/// smaller set.
+if (a.getHeight() < b.getHeight())
+  std::swap(a, b);
+LoanSet Result = a;
+for (LoanID LID : b) {
+  /// TODO(opt): Profiling shows that this loop is a major performance
+  /// bottleneck. Investigate using a BitVector to represent the set of
+  /// loans for improved join performance.
+  Result = Factory.LoanSetFact.add(Result, LID);
+}
+return Result;
+  }
+
+  void dump(llvm::raw_ostream &OS) const {
+OS << "LifetimeLattice State:\n";
+if (Origins.isEmpty())
+  OS << "  \n";
+for (const auto &Entry : Origins) {
+  if (Entry.second.isEmpty())
+OS

[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147295
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang-analysis

Author: Utkarsh Saxena (usx95)


Changes

This patch introduces the core dataflow analysis infrastructure for the C++ 
Lifetime Safety checker. This change implements the logic to propagate "loan" 
information across the control-flow graph. The primary goal is to compute a 
fixed-point state that accurately models which pointer (Origin) can hold which 
borrow (Loan) at any given program point.

Key components:

* `LifetimeLattice`: Defines the dataflow state, mapping an `OriginID` to a 
`LoanSet` using `llvm::ImmutableMap`.

* `Transferer`: Implements the transfer function, which updates the 
`LifetimeLattice` by applying the lifetime facts (Issue, AssignOrigin, etc.) 
generated for each basic block.

* `LifetimeDataflow`: A forward dataflow analysis driver that uses a worklist 
algorithm to iterate over the CFG until the lattice state converges.

The existing test suite has been extended to check the final dataflow results.

This work is a prerequisite for the final step of the analysis: consuming these 
results to identify and report lifetime violations.
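
A minimal sketch of the worklist iteration described above (simplified standard containers stand in for LLVM's immutable collections; all names are illustrative):

```cpp
#include <deque>
#include <functional>
#include <map>
#include <set>
#include <vector>

// Sketch only: a std::set of loan IDs stands in for the ImmutableSet-based
// LoanSet; the real lattice maps OriginID -> LoanSet.
struct Lattice {
  std::set<int> Loans;
  bool operator==(const Lattice &O) const { return Loans == O.Loans; }
  Lattice join(const Lattice &O) const {
    Lattice R = *this;
    R.Loans.insert(O.Loans.begin(), O.Loans.end());
    return R;
  }
};

using BlockID = int;
using TransferFn = std::function<Lattice(BlockID, const Lattice &)>;
using SuccFn = std::function<std::vector<BlockID>(BlockID)>;

// Forward dataflow: apply each block's facts, re-enqueue successors whose
// input state grew, and stop when nothing changes (the join is monotone).
std::map<BlockID, Lattice> runForwardDataflow(BlockID Entry, TransferFn TF,
                                              SuccFn Successors) {
  std::map<BlockID, Lattice> In;
  std::deque<BlockID> Worklist{Entry};
  while (!Worklist.empty()) {
    BlockID B = Worklist.front();
    Worklist.pop_front();
    Lattice Out = TF(B, In[B]);
    for (BlockID S : Successors(B)) {
      Lattice NewIn = In[S].join(Out);
      if (!(NewIn == In[S])) { // state changed: revisit successor
        In[S] = NewIn;
        Worklist.push_back(S);
      }
    }
  }
  return In;
}
```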

---

Patch is 20.87 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/147295.diff


2 Files Affected:

- (modified) clang/lib/Analysis/LifetimeSafety.cpp (+257-1) 
- (modified) clang/test/Sema/warn-lifetime-safety-dataflow.cpp (+186) 


```diff
diff --git a/clang/lib/Analysis/LifetimeSafety.cpp 
b/clang/lib/Analysis/LifetimeSafety.cpp
index 2c2309de90e26..e881e592ef59f 100644
--- a/clang/lib/Analysis/LifetimeSafety.cpp
+++ b/clang/lib/Analysis/LifetimeSafety.cpp
@@ -13,7 +13,10 @@
 #include "clang/Analysis/Analyses/PostOrderCFGView.h"
 #include "clang/Analysis/AnalysisDeclContext.h"
 #include "clang/Analysis/CFG.h"
+#include "clang/Analysis/FlowSensitive/DataflowWorklist.h"
 #include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/ImmutableMap.h"
+#include "llvm/ADT/ImmutableSet.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
@@ -482,7 +485,247 @@ class FactGenerator : public ConstStmtVisitor<FactGenerator> {
 };
 
 // = //
-//  TODO: Run dataflow analysis to propagate loans, analyse and error reporting.
+//  The Dataflow Lattice
+// = //
+
+// Using LLVM's immutable collections is efficient for dataflow analysis
+// as it avoids deep copies during state transitions.
+// TODO(opt): Consider using a bitset to represent the set of loans.
+using LoanSet = llvm::ImmutableSet<LoanID>;
+using OriginLoanMap = llvm::ImmutableMap<OriginID, LoanSet>;
+
+/// An object to hold the factories for immutable collections, ensuring
+/// that all created states share the same underlying memory management.
+struct LifetimeFactory {
+  OriginLoanMap::Factory OriginMapFact;
+  LoanSet::Factory LoanSetFact;
+
+  LoanSet createLoanSet(LoanID LID) {
+return LoanSetFact.add(LoanSetFact.getEmptySet(), LID);
+  }
+};
+
+/// LifetimeLattice represents the state of our analysis at a given program
+/// point. It is an immutable object, and all operations produce a new
+/// instance rather than modifying the existing one.
+struct LifetimeLattice {
+  /// The map from an origin to the set of loans it contains.
+  /// TODO(opt): To reduce the lattice size, propagate origins of declarations,
+  /// not expressions, because expressions are not visible across blocks.
+  OriginLoanMap Origins = OriginLoanMap(nullptr);
+
+  explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {}
+  LifetimeLattice() = default;
+
+  bool operator==(const LifetimeLattice &Other) const {
+return Origins == Other.Origins;
+  }
+  bool operator!=(const LifetimeLattice &Other) const {
+return !(*this == Other);
+  }
+
+  LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const {
+if (auto *Loans = Origins.lookup(OID))
+  return *Loans;
+return Factory.LoanSetFact.getEmptySet();
+  }
+
+  /// Computes the union of two lattices by performing a key-wise join of
+  /// their OriginLoanMaps.
+  // TODO(opt): This key-wise join is a performance bottleneck. A more
+  // efficient merge could be implemented using a Patricia Trie or HAMT
+  // instead of the current AVL-tree-based ImmutableMap.
+  LifetimeLattice join(const LifetimeLattice &Other,
+   LifetimeFactory &Factory) const {
+/// Merge the smaller map into the larger one ensuring we iterate over the
+/// smaller map.
+if (Origins.getHeight() < Other.Origins.getHeight())
+  return Other.join(*this, Factory);
+
+OriginLoanMap JoinedState = Origins;
+// For each origin in the other map, union its loan set with ours.
+for (const auto &Entry : Other.Origins) {
+  OriginID OID = Entry.first;
+  LoanSet OtherLoanSet = Entry.second;
+  JoinedState = Factory.OriginMapFact.add(
+ 

[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 ready_for_review 
https://github.com/llvm/llvm-project/pull/147295
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] Drive profile validator from opt (PR #147418)

2025-07-07 Thread Mircea Trofin via llvm-branch-commits

https://github.com/mtrofin created 
https://github.com/llvm/llvm-project/pull/147418

None

From 03b555bc71ff35cee5b97a8c9d7883396d4d7f31 Mon Sep 17 00:00:00 2001
From: Mircea Trofin 
Date: Mon, 7 Jul 2025 12:44:41 -0700
Subject: [PATCH] Drive profile validator from opt

---
 llvm/tools/opt/NewPMDriver.cpp | 8 ++--
 llvm/tools/opt/NewPMDriver.h   | 2 +-
 llvm/tools/opt/optdriver.cpp   | 7 ++-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 7d168a6ceb17c..042ed027639bc 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/Debugify.h"
+#include "llvm/Transforms/Utils/ProfileValidation.h"
 
 using namespace llvm;
 using namespace opt_tool;
@@ -356,7 +357,7 @@ bool llvm::runPassPipeline(
 OutputKind OK, VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder,
 bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex,
 bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve,
-bool UnifiedLTO) {
+bool EnableProfcheck, bool UnifiedLTO) {
   auto FS = vfs::getRealFileSystem();
   std::optional<PGOOptions> P;
   switch (PGOKindFlag) {
@@ -487,7 +488,8 @@ bool llvm::runPassPipeline(
   if (VerifyDIPreserve)
 MPM.addPass(NewPMDebugifyPass(DebugifyMode::OriginalDebugInfo, "",
   &DebugInfoBeforePass));
-
+  if (EnableProfcheck)
+MPM.addPass(createModuleToFunctionPassAdaptor(ProfileInjectorPass()));
   // Add passes according to the -passes options.
   if (!PassPipeline.empty()) {
 if (auto Err = PB.parsePassPipeline(MPM, PassPipeline)) {
@@ -504,6 +506,8 @@ bool llvm::runPassPipeline(
 MPM.addPass(NewPMCheckDebugifyPass(
 false, "", nullptr, DebugifyMode::OriginalDebugInfo,
 &DebugInfoBeforePass, VerifyDIPreserveExport));
+  if (EnableProfcheck)
+MPM.addPass(createModuleToFunctionPassAdaptor(ProfileVerifierPass()));
 
   // Add any relevant output pass at the end of the pipeline.
   switch (OK) {
diff --git a/llvm/tools/opt/NewPMDriver.h b/llvm/tools/opt/NewPMDriver.h
index 2daae571e72c2..6c21d6cae4e75 100644
--- a/llvm/tools/opt/NewPMDriver.h
+++ b/llvm/tools/opt/NewPMDriver.h
@@ -75,7 +75,7 @@ bool runPassPipeline(
 bool ShouldPreserveAssemblyUseListOrder,
 bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex,
 bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve,
-bool UnifiedLTO = false);
+bool EnableProfcheck, bool UnifiedLTO = false);
 } // namespace llvm
 
 #endif
diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
index de46efa13025d..588110361466d 100644
--- a/llvm/tools/opt/optdriver.cpp
+++ b/llvm/tools/opt/optdriver.cpp
@@ -213,6 +213,10 @@ static cl::opt<bool> VerifyDebugInfoPreserve(
 cl::desc("Start the pipeline with collecting and end it with checking of "
  "debug info preservation."));
 
+static cl::opt<bool> EnableProfileVerification(
+"enable-profcheck", cl::init(true),
+cl::desc("Start the pipeline with prof-inject and end it with prof-check"));
+
static cl::opt<std::string> ClDataLayout("data-layout",
  cl::desc("data layout string to use"),
  cl::value_desc("layout-string"),
@@ -731,7 +735,8 @@ extern "C" int optMain(
RemarksFile.get(), Pipeline, PluginList, PassBuilderCallbacks,
OK, VK, PreserveAssemblyUseListOrder,
PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash,
-   EnableDebugify, VerifyDebugInfoPreserve, UnifiedLTO)
+   EnableDebugify, VerifyDebugInfoPreserve,
+   EnableProfileVerification, UnifiedLTO)
? 0
: 1;
   }

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [mlir] [Flang][MLIR] Add `!$omp unroll` and `omp.unroll_heuristic` (PR #144785)

2025-07-07 Thread Michael Kruse via llvm-branch-commits

https://github.com/Meinersbur edited 
https://github.com/llvm/llvm-project/pull/144785
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [mlir] [Flang][MLIR] Add `!$omp unroll` and `omp.unroll_heuristic` (PR #144785)

2025-07-07 Thread Michael Kruse via llvm-branch-commits

https://github.com/Meinersbur ready_for_review 
https://github.com/llvm/llvm-project/pull/144785
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] Triple: Record default exception handling type (PR #147225)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm closed 
https://github.com/llvm/llvm-project/pull/147225
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [LifetimeSafety] Add script performance benchmarking (PR #147315)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147315
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)

2025-07-07 Thread via llvm-branch-commits

https://github.com/llvmbot milestoned 
https://github.com/llvm/llvm-project/pull/147409
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:

@sunfishcode What do you think about merging this PR to the release branch?

https://github.com/llvm/llvm-project/pull/147409
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)

2025-07-07 Thread Dan Gohman via llvm-branch-commits

https://github.com/sunfishcode approved this pull request.


https://github.com/llvm/llvm-project/pull/147409
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-webassembly

Author: None (llvmbot)


Changes

Backport a8a9a7f

Requested by: @sunfishcode

---
Full diff: https://github.com/llvm/llvm-project/pull/147409.diff


2 Files Affected:

- (modified) llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td (+2-2) 
- (modified) llvm/test/CodeGen/WebAssembly/inline-asm.ll (+11) 


```diff
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td 
b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 17889dacc868c..31a33c1e7365b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -64,8 +64,8 @@ def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32, I32_0)>;
 def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64, I64_0)>;
 def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
 def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
-def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8,
-v8i16],
+def V128 : WebAssemblyRegClass<[v2i64, v4i32, v16i8, v8i16,
+v8f16, v4f32, v2f64],
128, (add V128_0)>;
 def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>;
 def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>;
diff --git a/llvm/test/CodeGen/WebAssembly/inline-asm.ll 
b/llvm/test/CodeGen/WebAssembly/inline-asm.ll
index 4462cfb7aa0c4..c378fd953a555 100644
--- a/llvm/test/CodeGen/WebAssembly/inline-asm.ll
+++ b/llvm/test/CodeGen/WebAssembly/inline-asm.ll
@@ -129,7 +129,18 @@ entry:
   ret i32 %ret
 }
 
+; CHECK-LABEL: v128_load
+; CHECK: local.get 0
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: local.set 1
+define <4 x i32> @v128_load(ptr %v) #1 {
+entry:
+  %0 = tail call <4 x i32> asm "local.get $1\0Av128.load 0\0Alocal.set $0", 
"=r,r"(ptr %v)
+  ret <4 x i32> %0
+}
+
 attributes #0 = { nounwind }
+attributes #1 = { "target-features"="+simd128" }
 
 !0 = !{i32 47}
 !1 = !{i32 145}

```




https://github.com/llvm/llvm-project/pull/147409
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)

2025-07-07 Thread via llvm-branch-commits

https://github.com/llvmbot created 
https://github.com/llvm/llvm-project/pull/147409

Backport a8a9a7f

Requested by: @sunfishcode

From 9fd5816e48736cc51a118311e805d3e1f3758092 Mon Sep 17 00:00:00 2001
From: Alex Crichton 
Date: Wed, 2 Jul 2025 05:26:30 +0200
Subject: [PATCH] [WebAssembly] Fix inline assembly with vector types (#146574)

This commit fixes using inline assembly with v128 results. Previously
this failed with an internal assertion about a failure to legalize a
`CopyFromReg` where the source register was typed `v8f16`. It looks like
the type used for the destination register was whatever was listed first
in the `def V128 : WebAssemblyRegClass` listing, so the types were
shuffled around to have a default-supported type.

A small test was added as well which failed to generate previously and
should now pass in generation. This test passed on LLVM 18 additionally
and regressed by accident in #93228 which was first included in LLVM 19.

(cherry picked from commit a8a9a7f95a695c02bdf3d5821d1c62cc8e08c2ff)
---
 .../lib/Target/WebAssembly/WebAssemblyRegisterInfo.td |  4 ++--
 llvm/test/CodeGen/WebAssembly/inline-asm.ll   | 11 +++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td 
b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 17889dacc868c..31a33c1e7365b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -64,8 +64,8 @@ def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32, I32_0)>;
 def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64, I64_0)>;
 def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
 def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
-def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8,
-v8i16],
+def V128 : WebAssemblyRegClass<[v2i64, v4i32, v16i8, v8i16,
+v8f16, v4f32, v2f64],
128, (add V128_0)>;
 def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>;
 def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>;
diff --git a/llvm/test/CodeGen/WebAssembly/inline-asm.ll 
b/llvm/test/CodeGen/WebAssembly/inline-asm.ll
index 4462cfb7aa0c4..c378fd953a555 100644
--- a/llvm/test/CodeGen/WebAssembly/inline-asm.ll
+++ b/llvm/test/CodeGen/WebAssembly/inline-asm.ll
@@ -129,7 +129,18 @@ entry:
   ret i32 %ret
 }
 
+; CHECK-LABEL: v128_load
+; CHECK: local.get 0
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: local.set 1
+define <4 x i32> @v128_load(ptr %v) #1 {
+entry:
+  %0 = tail call <4 x i32> asm "local.get $1\0Av128.load 0\0Alocal.set $0", 
"=r,r"(ptr %v)
+  ret <4 x i32> %0
+}
+
 attributes #0 = { nounwind }
+attributes #1 = { "target-features"="+simd128" }
 
 !0 = !{i32 47}
 !1 = !{i32 145}

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] Users/usx95/lifetime safety benchmarking (PR #147315)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147315
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Ross Brunton (RossBrunton)


Changes

When generating the device info tree, nodes can be marked with an
offload Device Info value. The nodes can also look up children based
on this value.
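
As a concrete (hedged) illustration of the scheme, pairing the plugin-side `add` with the liboffload-side `get` from the diff below; the device-name value and the `consume` helper are made up:

```cpp
// Plugin side: publish an entry and tag it with an offload device-info key.
InfoTreeNode Info;
Info.add("Device Name", std::string("gfx90a"), "", DeviceInfo::NAME);

// liboffload side: find the same node by tag, with no string matching.
if (auto Node = Info.get(DeviceInfo::NAME))
  consume((*Node)->Value); // consume() stands in for the real query handler
```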


---
Full diff: https://github.com/llvm/llvm-project/pull/147317.diff


3 Files Affected:

- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+7-4) 
- (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+24-3) 
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+5-3) 


```diff
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp 
b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 832c31c43b5d2..52ea3283b24ef 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2562,7 +2562,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
 if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
   Info.add("HSA Runtime Version",
-   std::to_string(Major) + "." + std::to_string(Minor));
+   std::to_string(Major) + "." + std::to_string(Minor), "",
+   DeviceInfo::DRIVER_VERSION);
 
 Info.add("HSA OpenMP Device Number", DeviceId);
 
@@ -2572,11 +2573,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
 Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
 if (Status == HSA_STATUS_SUCCESS)
-  Info.add("Device Name", TmpChar);
+  Info.add("Device Name", TmpChar, "", DeviceInfo::NAME);
 
 Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
 if (Status == HSA_STATUS_SUCCESS)
-  Info.add("Vendor Name", TmpChar);
+  Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR);
 
 hsa_device_type_t DevType;
 Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
@@ -2652,7 +2653,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
 if (Status == HSA_STATUS_SUCCESS) {
-  auto &MaxSize = *Info.add("Workgroup Max Size per Dimension");
+  auto &MaxSize =
+  *Info.add("Workgroup Max Size per Dimension", std::monostate{}, "",
+DeviceInfo::MAX_WORK_GROUP_SIZE);
   MaxSize.add("x", WorkgrpMaxDim[0]);
   MaxSize.add("y", WorkgrpMaxDim[1]);
   MaxSize.add("z", WorkgrpMaxDim[2]);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h 
b/offload/plugins-nextgen/common/include/PluginInterface.h
index b5addc13d6644..9dc01ca0277fe 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -113,6 +113,12 @@ struct AsyncInfoWrapperTy {
   __tgt_async_info *AsyncInfoPtr;
 };
 
+enum class DeviceInfo {
+#define OFFLOAD_DEVINFO(Name, _, Value) Name = Value,
+#include "OffloadInfo.inc"
+#undef OFFLOAD_DEVINFO
+};
+
 /// Tree node for device information
 ///
 /// This information is either printed or used by liboffload to extract certain
@@ -133,6 +139,8 @@ struct InfoTreeNode {
   // * The same key can appear multiple times
  std::unique_ptr<llvm::SmallVector<InfoTreeNode>> Children;
 
+  std::map<DeviceInfo, size_t> DeviceInfoMap;
+
   InfoTreeNode() : InfoTreeNode("", std::monostate{}, "") {}
   InfoTreeNode(std::string Key, VariantType Value, std::string Units)
   : Key(Key), Value(Value), Units(Units) {}
@@ -140,10 +148,12 @@ struct InfoTreeNode {
   /// Add a new info entry as a child of this node. The entry requires at least
   /// a key string in \p Key. The value in \p Value is optional and can be any
   /// type that is representable as a string. The units in \p Units is optional
-  /// and must be a string.
+  /// and must be a string. Providing a device info key allows liboffload to
+  /// use that value for an appropriate olGetDeviceInfo query
   template <typename T>
   InfoTreeNode *add(std::string Key, T Value = T(),
-const std::string &Units = std::string()) {
+const std::string &Units = std::string(),
+std::optional<DeviceInfo> DeviceInfoKey = std::nullopt) {
 assert(!Key.empty() && "Invalid info key");
 
 if (!Children)
@@ -157,7 +167,12 @@ struct InfoTreeNode {
 else
   ValueVariant = std::string{Value};
 
-return &Children->emplace_back(Key, ValueVariant, Units);
+auto Ptr = &Children->emplace_back(Key, ValueVariant, Units);
+
+if (DeviceInfoKey)
+  DeviceInfoMap[*DeviceInfoKey] = Children->size() - 1;
+
+return Ptr;
   }
 
   std::optional<InfoTreeNode *> get(StringRef Key) {
@@ -171,6 +186,12 @@ struct InfoTreeNode {
 return It;
   }
 
+  std::optional<InfoTreeNode *> get(DeviceInfo Info) {
+if (DeviceInfoMap.count(Info))
+  return &(*Children)[DeviceInfoMap[Info]];
+return std::nullopt;
+  }
+
   /// Print all info entries in the tree
   void print() const {
 // Fake an additional indent so that 

[llvm-branch-commits] [clang] [LifetimeSafety] Add script performance benchmarking (PR #147315)

2025-07-07 Thread via llvm-branch-commits

github-actions[bot] wrote:




:warning: Python code formatter, darker found issues in your code. :warning:



You can test this locally with the following command:


```bash
darker --check --diff -r HEAD~1...HEAD 
clang/test/Analysis/lifetime_safety/benchmark.py
```





View the diff from darker here.


```diff
--- benchmark.py    2025-07-07 15:13:00.00 +
+++ benchmark.py    2025-07-07 15:15:18.715309 +
@@ -7,10 +7,11 @@
 from datetime import datetime
 import numpy as np
 from scipy.optimize import curve_fit
 from scipy.stats import t
 
+
 def generate_cpp_cycle_test(n: int) -> str:
 """
 Generates a C++ code snippet with a specified number of pointers in a 
cycle.
 """
 if n <= 0:
@@ -32,10 +33,11 @@
 cpp_code += f"p{n} = temp;\n"
 cpp_code += "  }\n}\n"
 cpp_code += f"\nint main() {{ long_cycle_{n}(false); return 0; }}\n"
 return cpp_code
 
+
 def generate_cpp_merge_test(n: int) -> str:
 """
 Generates a C++ code snippet with N independent conditional assignments.
 """
 if n <= 0:
@@ -53,163 +55,188 @@
 
 cpp_code += "}\n"
 cpp_code += f"\nint main() {{ conditional_merges_{n}(false); return 0; 
}}\n"
 return cpp_code
 
+
 def analyze_trace_file(trace_path: str) -> tuple[float, float]:
 """
 Parses the -ftime-trace JSON output to find durations.
 
 Returns:
 A tuple of (lifetime_analysis_duration_us, total_clang_duration_us).
 """
 lifetime_duration = 0.0
 total_duration = 0.0
 try:
-with open(trace_path, 'r') as f:
+with open(trace_path, "r") as f:
 trace_data = json.load(f)
-for event in trace_data.get('traceEvents', []):
-if event.get('name') == 'LifetimeAnalysis':
-lifetime_duration += float(event.get('dur', 0))
-if event.get('name') == 'ExecuteCompiler':
-total_duration += float(event.get('dur', 0))
+for event in trace_data.get("traceEvents", []):
+if event.get("name") == "LifetimeAnalysis":
+lifetime_duration += float(event.get("dur", 0))
+if event.get("name") == "ExecuteCompiler":
+total_duration += float(event.get("dur", 0))
 
 except (IOError, json.JSONDecodeError) as e:
 print(f"Error reading or parsing trace file {trace_path}: {e}", 
file=sys.stderr)
 return 0.0, 0.0
 return lifetime_duration, total_duration
 
+
 def power_law(n, c, k):
 """Represents the power law function: y = c * n^k"""
 return c * np.power(n, k)
+
 
 def human_readable_time(ms: float) -> str:
 """Converts milliseconds to a human-readable string (ms or s)."""
 if ms >= 1000:
 return f"{ms / 1000:.2f} s"
 return f"{ms:.2f} ms"
+
 
 def generate_markdown_report(results: dict) -> str:
 """Generates a Markdown-formatted report from the benchmark results."""
 report = []
 timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S %Z")
 report.append(f"# Lifetime Analysis Performance Report")
 report.append(f"> Generated on: {timestamp}")
 report.append("\n---\n")
 
 for test_type, data in results.items():
-title = 'Pointer Cycle in Loop' if test_type == 'cycle' else 'CFG Merges'
+title = "Pointer Cycle in Loop" if test_type == "cycle" else "CFG Merges"
 report.append(f"## Test Case: {title}")
 report.append("")
 
 # Table header
 report.append("| N   | Analysis Time | Total Clang Time |")
 report.append("|:|--:|-:|")
 
 # Table rows
-n_data = np.array(data['n'])
-analysis_data = np.array(data['lifetime_ms'])
-total_data = np.array(data['total_ms'])
+n_data = np.array(data["n"])
+analysis_data = np.array(data["lifetime_ms"])
+total_data = np.array(data["total_ms"])
 for i in range(len(n_data)):
 analysis_str = human_readable_time(analysis_data[i])
 total_str = human_readable_time(total_data[i])
 report.append(f"| {n_data[i]:<3} | {analysis_str:>13} | 
{total_str:>16} |")
 
 report.append("")
 
 # Complexity analysis
 report.append(f"**Complexity Analysis:**")
 try:
-popt, pcov = curve_fit(power_law, n_data, analysis_data, p0=[0, 2], maxfev=5000)
+popt, pcov = curve_fit(
+power_law, n_data, analysis_data, p0=[0, 2], maxfev=5000
+)
 _, k = popt
-
+
 # R-squared calculation
 residuals = analysis_data - power_law(n_data, *popt)
 ss_res = np.sum(residuals**2)
-ss_tot = np.sum((analysis_data - np.mean(analysis_data))**2)
+ss_tot = np.sum((analysis_data - np.mean(analysis_data)) ** 2)
 r_squared = 1 - (ss_res / ss_tot)
-
+
 # Confidence I

[llvm-branch-commits] [llvm] [Offload] Refactor device information queries to use new tagging (PR #147318)

2025-07-07 Thread Ross Brunton via llvm-branch-commits

https://github.com/RossBrunton created 
https://github.com/llvm/llvm-project/pull/147318

Instead of using strings to look up device information (which is brittle
and slow), use the new tags that the plugins specify when building the
nodes.


>From 4cce1eec173637a0e50655e10ad520a9821b9960 Mon Sep 17 00:00:00 2001
From: Ross Brunton 
Date: Mon, 7 Jul 2025 16:13:32 +0100
Subject: [PATCH] [Offload] Refactor device information queries to use new
 tagging

Instead of using strings to look up device information (which is brittle
and slow), use the new tags that the plugins specify when building the
nodes.
---
 offload/liboffload/src/Helpers.hpp |  19 ++---
 offload/liboffload/src/OffloadImpl.cpp | 111 +++--
 2 files changed, 54 insertions(+), 76 deletions(-)

diff --git a/offload/liboffload/src/Helpers.hpp 
b/offload/liboffload/src/Helpers.hpp
index 8b85945508b98..62e55e500fac7 100644
--- a/offload/liboffload/src/Helpers.hpp
+++ b/offload/liboffload/src/Helpers.hpp
@@ -75,23 +75,16 @@ class InfoWriter {
   InfoWriter(InfoWriter &) = delete;
   ~InfoWriter() = default;
 
-  template <typename T> llvm::Error write(llvm::Expected<T> &&Val) {
-if (Val)
-  return getInfo(Size, Target, SizeRet, *Val);
-return Val.takeError();
+  template <typename T> llvm::Error write(T Val) {
+return getInfo(Size, Target, SizeRet, Val);
   }
 
-  template <typename T>
-  llvm::Error writeArray(llvm::Expected<T> &&Val, size_t Elems) {
-if (Val)
-  return getInfoArray(Elems, Size, Target, SizeRet, *Val);
-return Val.takeError();
+  template <typename T> llvm::Error writeArray(T Val, size_t Elems) {
+return getInfoArray(Elems, Size, Target, SizeRet, Val);
   }
 
-  llvm::Error writeString(llvm::Expected<llvm::StringRef> &&Val) {
-if (Val)
-  return getInfoString(Size, Target, SizeRet, *Val);
-return Val.takeError();
+  llvm::Error writeString(llvm::StringRef Val) {
+return getInfoString(Size, Target, SizeRet, Val);
   }
 
 private:
diff --git a/offload/liboffload/src/OffloadImpl.cpp 
b/offload/liboffload/src/OffloadImpl.cpp
index f9da638436705..c84bf01460252 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -286,78 +286,63 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
 return Plugin::error(ErrorCode::UNIMPLEMENTED, ErrBuffer.c_str());
   };
 
-  // Find the info if it exists under any of the given names
-  auto getInfoString =
-  [&](std::vector Names) -> llvm::Expected {
-for (auto &Name : Names) {
-  if (auto Entry = Device->Info.get(Name)) {
-if (!std::holds_alternative<std::string>((*Entry)->Value))
-  return makeError(ErrorCode::BACKEND_FAILURE,
-   "plugin returned incorrect type");
-return std::get<std::string>((*Entry)->Value).c_str();
-  }
-}
-
-return makeError(ErrorCode::UNIMPLEMENTED,
- "plugin did not provide a response for this information");
-  };
-
-  auto getInfoXyz =
-  [&](std::vector Names) -> llvm::Expected {
-for (auto &Name : Names) {
-  if (auto Entry = Device->Info.get(Name)) {
-auto Node = *Entry;
-ol_dimensions_t Out{0, 0, 0};
-
-auto getField = [&](StringRef Name, uint32_t &Dest) {
-  if (auto F = Node->get(Name)) {
-if (!std::holds_alternative<uint32_t>((*F)->Value))
-  return makeError(
-  ErrorCode::BACKEND_FAILURE,
-  "plugin returned incorrect type for dimensions element");
-Dest = std::get<uint32_t>((*F)->Value);
-  } else
-return makeError(ErrorCode::BACKEND_FAILURE,
- "plugin didn't provide all values for 
dimensions");
-  return Plugin::success();
-};
-
-if (auto Res = getField("x", Out.x))
-  return Res;
-if (auto Res = getField("y", Out.y))
-  return Res;
-if (auto Res = getField("z", Out.z))
-  return Res;
-
-return Out;
-  }
-}
+  // These are not implemented by the plugin interface
+  if (PropName == OL_DEVICE_INFO_PLATFORM)
+return Info.write(Device->Platform);
+  if (PropName == OL_DEVICE_INFO_TYPE)
+return Info.write(OL_DEVICE_TYPE_GPU);
+  // TODO: Update when https://github.com/llvm/llvm-project/pull/147314 is 
merged
+  if (PropName > OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE)
+return createOffloadError(ErrorCode::INVALID_ENUMERATION,
+  "getDeviceInfo enum '%i' is invalid", PropName);
 
+  auto EntryOpt = Device->Info.get(static_cast<DeviceInfo>(PropName));
+  if (!EntryOpt)
 return makeError(ErrorCode::UNIMPLEMENTED,
  "plugin did not provide a response for this information");
-  };
+  auto Entry = *EntryOpt;
 
   switch (PropName) {
-  case OL_DEVICE_INFO_PLATFORM:
-return Info.write(Device->Platform);
-  case OL_DEVICE_INFO_TYPE:
-return Info.write(OL_DEVICE_TYPE_GPU);
   case OL_DEVICE_INFO_NAME:
-return Info.writeString(getInfoString({"Device Name"}));
   case OL_DEVICE_INFO_VENDOR:

[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)

2025-07-07 Thread Joseph Huber via llvm-branch-commits


@@ -133,17 +139,21 @@ struct InfoTreeNode {
   // * The same key can appear multiple times
   std::unique_ptr<llvm::SmallVector<InfoTreeNode>> Children;
 
+  std::map<DeviceInfo, size_t> DeviceInfoMap;

jhuber6 wrote:

Do these need to be sorted? Otherwise a dense map is more efficient.
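
For reference, the unsorted alternative would be along these lines (sketch; an enum-class key needs a `DenseMapInfo` specialization):

```cpp
llvm::DenseMap<DeviceInfo, size_t> DeviceInfoMap; // no ordering guarantees
```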

https://github.com/llvm/llvm-project/pull/147317
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)

2025-07-07 Thread Joseph Huber via llvm-branch-commits


@@ -171,6 +186,12 @@ struct InfoTreeNode {
 return It;
   }
 
+  std::optional<InfoTreeNode *> get(DeviceInfo Info) {
+if (DeviceInfoMap.count(Info))
+  return &(*Children)[DeviceInfoMap[Info]];
+return std::nullopt;

jhuber6 wrote:

```suggestion
return !DeviceInfoMap.count(Info) ? std::nullopt : &(*Children)[DeviceInfoMap[Info]];
```

https://github.com/llvm/llvm-project/pull/147317
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [Offload] Refactor device information queries to use new tagging (PR #147318)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-offload

Author: Ross Brunton (RossBrunton)


Changes

Instead of using strings to look up device information (which is brittle
and slow), use the new tags that the plugins specify when building the
nodes.


---
Full diff: https://github.com/llvm/llvm-project/pull/147318.diff


2 Files Affected:

- (modified) offload/liboffload/src/Helpers.hpp (+6-13) 
- (modified) offload/liboffload/src/OffloadImpl.cpp (+48-63) 


```diff
diff --git a/offload/liboffload/src/Helpers.hpp 
b/offload/liboffload/src/Helpers.hpp
index 8b85945508b98..62e55e500fac7 100644
--- a/offload/liboffload/src/Helpers.hpp
+++ b/offload/liboffload/src/Helpers.hpp
@@ -75,23 +75,16 @@ class InfoWriter {
   InfoWriter(InfoWriter &) = delete;
   ~InfoWriter() = default;
 
-  template <typename T> llvm::Error write(llvm::Expected<T> &&Val) {
-if (Val)
-  return getInfo(Size, Target, SizeRet, *Val);
-return Val.takeError();
+  template <typename T> llvm::Error write(T Val) {
+return getInfo(Size, Target, SizeRet, Val);
   }
 
-  template <typename T>
-  llvm::Error writeArray(llvm::Expected<T> &&Val, size_t Elems) {
-if (Val)
-  return getInfoArray(Elems, Size, Target, SizeRet, *Val);
-return Val.takeError();
+  template <typename T> llvm::Error writeArray(T Val, size_t Elems) {
+return getInfoArray(Elems, Size, Target, SizeRet, Val);
   }
 
-  llvm::Error writeString(llvm::Expected<llvm::StringRef> &&Val) {
-if (Val)
-  return getInfoString(Size, Target, SizeRet, *Val);
-return Val.takeError();
+  llvm::Error writeString(llvm::StringRef Val) {
+return getInfoString(Size, Target, SizeRet, Val);
   }
 
 private:
diff --git a/offload/liboffload/src/OffloadImpl.cpp 
b/offload/liboffload/src/OffloadImpl.cpp
index f9da638436705..c84bf01460252 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -286,78 +286,63 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
 return Plugin::error(ErrorCode::UNIMPLEMENTED, ErrBuffer.c_str());
   };
 
-  // Find the info if it exists under any of the given names
-  auto getInfoString =
-  [&](std::vector Names) -> llvm::Expected {
-for (auto &Name : Names) {
-  if (auto Entry = Device->Info.get(Name)) {
-if (!std::holds_alternative<std::string>((*Entry)->Value))
-  return makeError(ErrorCode::BACKEND_FAILURE,
-   "plugin returned incorrect type");
-return std::get<std::string>((*Entry)->Value).c_str();
-  }
-}
-
-return makeError(ErrorCode::UNIMPLEMENTED,
- "plugin did not provide a response for this information");
-  };
-
-  auto getInfoXyz =
-  [&](std::vector Names) -> llvm::Expected {
-for (auto &Name : Names) {
-  if (auto Entry = Device->Info.get(Name)) {
-auto Node = *Entry;
-ol_dimensions_t Out{0, 0, 0};
-
-auto getField = [&](StringRef Name, uint32_t &Dest) {
-  if (auto F = Node->get(Name)) {
-if (!std::holds_alternative<uint32_t>((*F)->Value))
-  return makeError(
-  ErrorCode::BACKEND_FAILURE,
-  "plugin returned incorrect type for dimensions element");
-Dest = std::get<uint32_t>((*F)->Value);
-  } else
-return makeError(ErrorCode::BACKEND_FAILURE,
- "plugin didn't provide all values for 
dimensions");
-  return Plugin::success();
-};
-
-if (auto Res = getField("x", Out.x))
-  return Res;
-if (auto Res = getField("y", Out.y))
-  return Res;
-if (auto Res = getField("z", Out.z))
-  return Res;
-
-return Out;
-  }
-}
+  // These are not implemented by the plugin interface
+  if (PropName == OL_DEVICE_INFO_PLATFORM)
+return Info.write(Device->Platform);
+  if (PropName == OL_DEVICE_INFO_TYPE)
+return Info.write(OL_DEVICE_TYPE_GPU);
+  // TODO: Update when https://github.com/llvm/llvm-project/pull/147314 is 
merged
+  if (PropName > OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE)
+return createOffloadError(ErrorCode::INVALID_ENUMERATION,
+  "getDeviceInfo enum '%i' is invalid", PropName);
 
+  auto EntryOpt = Device->Info.get(static_cast<DeviceInfo>(PropName));
+  if (!EntryOpt)
 return makeError(ErrorCode::UNIMPLEMENTED,
  "plugin did not provide a response for this information");
-  };
+  auto Entry = *EntryOpt;
 
   switch (PropName) {
-  case OL_DEVICE_INFO_PLATFORM:
-return Info.write(Device->Platform);
-  case OL_DEVICE_INFO_TYPE:
-return Info.write(OL_DEVICE_TYPE_GPU);
   case OL_DEVICE_INFO_NAME:
-return Info.writeString(getInfoString({"Device Name"}));
   case OL_DEVICE_INFO_VENDOR:
-return Info.writeString(getInfoString({"Vendor Name"}));
-  case OL_DEVICE_INFO_DRIVER_VERSION:
-return Info.writeString(
-getInfoString({"CUDA Driver Version", "HSA Runtime Version"}));
-  case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
-return Info.write(getInfoXyz({"Workgroup Max Size per Dim

[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)

2025-07-07 Thread Matt Arsenault via llvm-branch-commits


@@ -171,6 +186,12 @@ struct InfoTreeNode {
 return It;
   }
 
+  std::optional<InfoTreeNode *> get(DeviceInfo Info) {
+if (DeviceInfoMap.count(Info))
+  return &(*Children)[DeviceInfoMap[Info]];
+return std::nullopt;

arsenm wrote:

This is still a double map lookup, do one find 
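
A single-lookup version of the accessor, as suggested (sketch):

```cpp
std::optional<InfoTreeNode *> get(DeviceInfo Info) {
  auto It = DeviceInfoMap.find(Info);
  if (It == DeviceInfoMap.end())
    return std::nullopt;
  return &(*Children)[It->second];
}
```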

https://github.com/llvm/llvm-project/pull/147317
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [Offload] Refactor device information queries to use new tagging (PR #147318)

2025-07-07 Thread via llvm-branch-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. :warning:



You can test this locally with the following command:


```bash
git-clang-format --diff HEAD~1 HEAD --extensions hpp,cpp -- 
offload/liboffload/src/Helpers.hpp offload/liboffload/src/OffloadImpl.cpp
```





View the diff from clang-format here.


```diff
diff --git a/offload/liboffload/src/OffloadImpl.cpp 
b/offload/liboffload/src/OffloadImpl.cpp
index c84bf0146..4ca32d2e0 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -291,7 +291,8 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
 return Info.write(Device->Platform);
   if (PropName == OL_DEVICE_INFO_TYPE)
 return Info.write(OL_DEVICE_TYPE_GPU);
-  // TODO: Update when https://github.com/llvm/llvm-project/pull/147314 is 
merged
+  // TODO: Update when https://github.com/llvm/llvm-project/pull/147314 is
+  // merged
   if (PropName > OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE)
 return createOffloadError(ErrorCode::INVALID_ENUMERATION,
   "getDeviceInfo enum '%i' is invalid", PropName);

```




https://github.com/llvm/llvm-project/pull/147318
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [LifetimeSafety] Add script performance benchmarking (PR #147315)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 updated 
https://github.com/llvm/llvm-project/pull/147315

From 0fbfd74d23b6cd26ef0480f7b9061b2f4a745338 Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena 
Date: Mon, 7 Jul 2025 15:13:00 +
Subject: [PATCH 1/2] [LifetimeSafety] Add script performance benchmarking

---
 clang/lib/Analysis/LifetimeSafety.cpp |   7 +-
 .../Analysis/lifetime_safety/benchmark.py | 215 ++
 2 files changed, 221 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Analysis/lifetime_safety/benchmark.py

diff --git a/clang/lib/Analysis/LifetimeSafety.cpp 
b/clang/lib/Analysis/LifetimeSafety.cpp
index e881e592ef59f..1c83b5051bad1 100644
--- a/clang/lib/Analysis/LifetimeSafety.cpp
+++ b/clang/lib/Analysis/LifetimeSafety.cpp
@@ -151,7 +151,12 @@ class OriginManager {
 
   OriginID get(const ValueDecl &D) {
 auto It = DeclToOriginID.find(&D);
-assert(It != DeclToOriginID.end());
+// TODO: This should be an assert(It != DeclToOriginID.end()). The current
+// implementation falls back to getOrCreate to avoid crashing on
+// yet-unhandled pointer expressions, creating an empty origin for them.
+if (It == DeclToOriginID.end())
+  return getOrCreate(D);
+
 return It->second;
   }
 
diff --git a/clang/test/Analysis/lifetime_safety/benchmark.py 
b/clang/test/Analysis/lifetime_safety/benchmark.py
new file mode 100644
index 0..ddf32e192de17
--- /dev/null
+++ b/clang/test/Analysis/lifetime_safety/benchmark.py
@@ -0,0 +1,215 @@
+import sys
+import argparse
+import subprocess
+import tempfile
+import json
+import os
+from datetime import datetime
+import numpy as np
+from scipy.optimize import curve_fit
+from scipy.stats import t
+
+def generate_cpp_cycle_test(n: int) -> str:
+"""
+Generates a C++ code snippet with a specified number of pointers in a 
cycle.
+"""
+if n <= 0:
+return "// Number of variables must be positive."
+
+cpp_code = "struct MyObj { int id; ~MyObj() {} };\n\n"
+cpp_code += f"void long_cycle_{n}(bool condition) {{\n"
+for i in range(1, n + 1):
+cpp_code += f"  MyObj v{i}{{1}};\n"
+cpp_code += "\n"
+for i in range(1, n + 1):
+cpp_code += f"  MyObj* p{i} = &v{i};\n"
+
+cpp_code += "\n  while (condition) {\n"
+if n > 0:
+cpp_code += f"MyObj* temp = p1;\n"
+for i in range(1, n):
+cpp_code += f"p{i} = p{i+1};\n"
+cpp_code += f"p{n} = temp;\n"
+cpp_code += "  }\n}\n"
+cpp_code += f"\nint main() {{ long_cycle_{n}(false); return 0; }}\n"
+return cpp_code
+
+def generate_cpp_merge_test(n: int) -> str:
+"""
+Generates a C++ code snippet with N independent conditional assignments.
+"""
+if n <= 0:
+return "// Number of variables must be positive."
+
+cpp_code = "struct MyObj { int id; ~MyObj() {} };\n\n"
+cpp_code += f"void conditional_merges_{n}(bool condition) {{\n"
+decls = [f"v{i}" for i in range(1, n + 1)]
+cpp_code += f"  MyObj {', '.join(decls)};\n"
+ptr_decls = [f"*p{i} = nullptr" for i in range(1, n + 1)]
+cpp_code += f"  MyObj {', '.join(ptr_decls)};\n\n"
+
+for i in range(1, n + 1):
+cpp_code += f"  if(condition) {{ p{i} = &v{i}; }}\n"
+
+cpp_code += "}\n"
+cpp_code += f"\nint main() {{ conditional_merges_{n}(false); return 0; 
}}\n"
+return cpp_code
+
+def analyze_trace_file(trace_path: str) -> tuple[float, float]:
+"""
+Parses the -ftime-trace JSON output to find durations.
+
+Returns:
+A tuple of (lifetime_analysis_duration_us, total_clang_duration_us).
+"""
+lifetime_duration = 0.0
+total_duration = 0.0
+try:
+with open(trace_path, 'r') as f:
+trace_data = json.load(f)
+for event in trace_data.get('traceEvents', []):
+if event.get('name') == 'LifetimeAnalysis':
+lifetime_duration += float(event.get('dur', 0))
+if event.get('name') == 'ExecuteCompiler':
+total_duration += float(event.get('dur', 0))
+
+except (IOError, json.JSONDecodeError) as e:
+print(f"Error reading or parsing trace file {trace_path}: {e}", 
file=sys.stderr)
+return 0.0, 0.0
+return lifetime_duration, total_duration
+
+def power_law(n, c, k):
+"""Represents the power law function: y = c * n^k"""
+return c * np.power(n, k)
+
+def human_readable_time(ms: float) -> str:
+"""Converts milliseconds to a human-readable string (ms or s)."""
+if ms >= 1000:
+return f"{ms / 1000:.2f} s"
+return f"{ms:.2f} ms"
+
+def generate_markdown_report(results: dict) -> str:
+"""Generates a Markdown-formatted report from the benchmark results."""
+report = []
+timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S %Z")
+report.append(f"# Lifetime Analysis Performance Report")
+report.append(f"> Generated on: {timestamp}")
+report.append("\n---\n")
+
+for test_typ

[llvm-branch-commits] [clang] [LifetimeSafety] Add script for performance benchmarking (PR #147315)

2025-07-07 Thread Utkarsh Saxena via llvm-branch-commits

https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147315
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)

2025-07-07 Thread Ross Brunton via llvm-branch-commits

https://github.com/RossBrunton created 
https://github.com/llvm/llvm-project/pull/147317

When generating the device info tree, nodes can be marked with an
offload Device Info value. The nodes can also look up children based
on this value.


From 9b79557e7a536ccd4b02365c9dd98a4ef69f87e1 Mon Sep 17 00:00:00 2001
From: Ross Brunton 
Date: Mon, 7 Jul 2025 16:10:19 +0100
Subject: [PATCH] [Offload] Allow "tagging" device info entries with offload
 keys

When generating the device info tree, nodes can be marked with an
offload Device Info value. The nodes can also look up children based
on this value.
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp| 11 +---
 .../common/include/PluginInterface.h  | 27 ---
 offload/plugins-nextgen/cuda/src/rtl.cpp  |  8 +++---
 3 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp 
b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 832c31c43b5d2..52ea3283b24ef 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2562,7 +2562,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
 if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
   Info.add("HSA Runtime Version",
-   std::to_string(Major) + "." + std::to_string(Minor));
+   std::to_string(Major) + "." + std::to_string(Minor), "",
+   DeviceInfo::DRIVER_VERSION);
 
 Info.add("HSA OpenMP Device Number", DeviceId);
 
@@ -2572,11 +2573,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
 Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
 if (Status == HSA_STATUS_SUCCESS)
-  Info.add("Device Name", TmpChar);
+  Info.add("Device Name", TmpChar, "", DeviceInfo::NAME);
 
 Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
 if (Status == HSA_STATUS_SUCCESS)
-  Info.add("Vendor Name", TmpChar);
+  Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR);
 
 hsa_device_type_t DevType;
 Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
@@ -2652,7 +2653,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
 if (Status == HSA_STATUS_SUCCESS) {
-  auto &MaxSize = *Info.add("Workgroup Max Size per Dimension");
+  auto &MaxSize =
+  *Info.add("Workgroup Max Size per Dimension", std::monostate{}, "",
+DeviceInfo::MAX_WORK_GROUP_SIZE);
   MaxSize.add("x", WorkgrpMaxDim[0]);
   MaxSize.add("y", WorkgrpMaxDim[1]);
   MaxSize.add("z", WorkgrpMaxDim[2]);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h 
b/offload/plugins-nextgen/common/include/PluginInterface.h
index b5addc13d6644..9dc01ca0277fe 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -113,6 +113,12 @@ struct AsyncInfoWrapperTy {
   __tgt_async_info *AsyncInfoPtr;
 };
 
+enum class DeviceInfo {
+#define OFFLOAD_DEVINFO(Name, _, Value) Name = Value,
+#include "OffloadInfo.inc"
+#undef OFFLOAD_DEVINFO
+};
+
 /// Tree node for device information
 ///
 /// This information is either printed or used by liboffload to extract certain
@@ -133,6 +139,8 @@ struct InfoTreeNode {
   // * The same key can appear multiple times
   std::unique_ptr<llvm::SmallVector<InfoTreeNode>> Children;
 
+  std::map<DeviceInfo, size_t> DeviceInfoMap;
+
   InfoTreeNode() : InfoTreeNode("", std::monostate{}, "") {}
   InfoTreeNode(std::string Key, VariantType Value, std::string Units)
   : Key(Key), Value(Value), Units(Units) {}
@@ -140,10 +148,12 @@ struct InfoTreeNode {
   /// Add a new info entry as a child of this node. The entry requires at least
   /// a key string in \p Key. The value in \p Value is optional and can be any
   /// type that is representable as a string. The units in \p Units is optional
-  /// and must be a string.
+  /// and must be a string. Providing a device info key allows liboffload to
+  /// use that value for an appropriate olGetDeviceInfo query
   template <typename T>
   InfoTreeNode *add(std::string Key, T Value = T(),
-const std::string &Units = std::string()) {
+const std::string &Units = std::string(),
+std::optional<DeviceInfo> DeviceInfoKey = std::nullopt) {
 assert(!Key.empty() && "Invalid info key");
 
 if (!Children)
@@ -157,7 +167,12 @@ struct InfoTreeNode {
 else
   ValueVariant = std::string{Value};
 
-return &Children->emplace_back(Key, ValueVariant, Units);
+auto Ptr = &Children->emplace_back(Key, ValueVariant, Units);
+
+if (DeviceInfoKey)
+  DeviceInfoMap[*DeviceInfoKey] = Children->size() - 1;
+
+return Ptr;
   }
 
   std::optional<InfoTreeNode *> get(StringRef Key) {
@@ -171,6 +186,12 @@ struct InfoTreeNode {
  

[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)

2025-07-07 Thread Sander de Smalen via llvm-branch-commits


@@ -784,8 +785,8 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
   assert((!MF.getSubtarget().hasSVE() ||
   AFI->hasCalculatedStackSizeSVE()) &&
  "Expected SVE area to be calculated by this point");
-  return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE() &&
- !AFI->hasStackHazardSlotIndex();
+  return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeZPR() &&
+ !AFI->getStackSizePPR() && !AFI->hasStackHazardSlotIndex();

sdesmalen-arm wrote:

nit:
```suggestion
  return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->hasSVEStackSize() &&
 !AFI->hasStackHazardSlotIndex();
```

https://github.com/llvm/llvm-project/pull/142391
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)

2025-07-07 Thread Sander de Smalen via llvm-branch-commits


@@ -299,14 +297,20 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
 TailCallReservedStack = bytes;
   }
 
-  bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; }
+  void setStackSizeZPR(uint64_t S) {
+HasCalculatedStackSizeSVE = true;

sdesmalen-arm wrote:

nit: this function sets `HasCalculatedStackSizeSVE` even if only one of the two
values is set. Is it worth making this `setStackSizeSVE(uint64_t ZPR, uint64_t
PPR = 0)` such that `HasCalculatedStackSizeSVE` is set only once?
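
Spelled out, the combined setter the comment suggests might look like this (a sketch of the reviewer's idea, not code from the patch):

```cpp
void setStackSizeSVE(uint64_t ZPR, uint64_t PPR = 0) {
  StackSizeZPR = ZPR;
  StackSizePPR = PPR;
  HasCalculatedStackSizeSVE = true; // set exactly once, for both areas
}
```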

https://github.com/llvm/llvm-project/pull/142391
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)

2025-07-07 Thread Sander de Smalen via llvm-branch-commits


@@ -299,14 +297,20 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
 TailCallReservedStack = bytes;
   }
 
-  bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; }
+  void setStackSizeZPR(uint64_t S) {
+HasCalculatedStackSizeSVE = true;
+StackSizeZPR = S;
+  }
 
-  void setStackSizeSVE(uint64_t S) {
+  void setStackSizePPR(uint64_t S) {
 HasCalculatedStackSizeSVE = true;
-StackSizeSVE = S;
+StackSizePPR = S;
   }
 
-  uint64_t getStackSizeSVE() const { return StackSizeSVE; }
+  uint64_t getStackSizeZPR() const { return StackSizeZPR; }

sdesmalen-arm wrote:

not related to your PR, but I think we should add an assert that 
`HasCalculatedStackSizeSVE` is true (same for CalleeSavedStackSize), although 
unfortunately that currently leads to some failures where they're used.

https://github.com/llvm/llvm-project/pull/142391
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)

2025-07-07 Thread Sander de Smalen via llvm-branch-commits


@@ -451,10 +454,36 @@ static unsigned getFixedObjectSize(const MachineFunction &MF,
   }
 }
 
-/// Returns the size of the entire SVE stackframe (calleesaves + spills).
+static unsigned getStackHazardSize(const MachineFunction &MF) {

sdesmalen-arm wrote:

nit: maybe just move the implementations to where they are declared?

https://github.com/llvm/llvm-project/pull/142391
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)

2025-07-07 Thread Sander de Smalen via llvm-branch-commits


@@ -19,6 +19,11 @@
 
 namespace llvm {
 
+struct SVEStackSizes {

sdesmalen-arm wrote:

Should this be named `SVEStackOffsets` (given that they're used as signed 
offsets)?

https://github.com/llvm/llvm-project/pull/142391
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)

2025-07-07 Thread Sander de Smalen via llvm-branch-commits


@@ -644,7 +644,8 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
 if (ST.hasSVE() || ST.isStreaming()) {
   // Frames that have variable sized objects and scalable SVE objects,
   // should always use a basepointer.
-  if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE())
+  if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeZPR() ||
+  AFI->getStackSizePPR())

sdesmalen-arm wrote:

nit:
```suggestion
  if (!AFI->hasCalculatedStackSizeSVE() || AFI->hasSVEStackSize())
```

https://github.com/llvm/llvm-project/pull/142391
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)

2025-07-07 Thread Sander de Smalen via llvm-branch-commits


@@ -1605,25 +1634,19 @@ static bool isTargetWindows(const MachineFunction &MF) {
   return MF.getSubtarget().isTargetWindows();
 }
 
-static unsigned getStackHazardSize(const MachineFunction &MF) {
-  return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize();
-}
-
 // Convenience function to determine whether I is an SVE callee save.
-static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
+static bool IsZPRCalleeSave(MachineBasicBlock::iterator I) {

sdesmalen-arm wrote:

nit: given that you're renaming these, what about calling them 
`isPartOfZPRCalleeSave` (because a `PTRUE_B` instruction is not a callee-save 
in itself)

https://github.com/llvm/llvm-project/pull/142391
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)

2025-07-07 Thread Sander de Smalen via llvm-branch-commits


@@ -4294,24 +4396,32 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
   report_fatal_error(
   "Alignment of scalable vectors > 16 bytes is not yet supported");
 
+int64_t &Offset = OffsetForObject(FI, ZPROffset, PPROffset);
 Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
 if (AssignOffsets)
   Assign(FI, -Offset);
   }
 
-  return Offset;
+  PPROffset = alignTo(PPROffset, Align(16U));
+  ZPROffset = alignTo(ZPROffset, Align(16U));
+
+  if (&ZPROffset != &PPROffset) {
+// SplitSVEObjects (PPRs and ZPRs allocated to separate areas).
+return SVEStackSizes{ZPROffset, PPROffset};
+  }
+  // When SplitSVEObjects is disabled just attribute all the stack to ZPRs.
+  // Determining the split is not necessary.
+  return SVEStackSizes{ZPROffset, 0};

sdesmalen-arm wrote:

When you use an instance of the return type (`SVEStackSizes`) instead of 
`ZPRStack` and `PPRStack`, then you can just return that struct at the end of 
this function.

https://github.com/llvm/llvm-project/pull/142391
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)

2025-07-07 Thread Sander de Smalen via llvm-branch-commits


@@ -4227,10 +4310,20 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
 // Fills in the first and last callee-saved frame indices into
 // Min/MaxCSFrameIndex, respectively.
 // Returns the size of the stack.
-static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
-  int &MinCSFrameIndex,
-  int &MaxCSFrameIndex,
-  bool AssignOffsets) {
+static SVEStackSizes
+determineSVEStackObjectOffsets(MachineFunction &MF, bool AssignOffsets,
+   bool SplitSVEObjects = false) {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  int64_t ZPRStack = 0;
+  int64_t PPRStack = 0;
+
+  auto [ZPROffset, PPROffset] = [&] {
+if (SplitSVEObjects)
+  return std::tie(ZPRStack, PPRStack);
+return std::tie(ZPRStack, ZPRStack);
+  }();

sdesmalen-arm wrote:

This seems a lot more readable:
```suggestion
  int64_t &ZPROffset = ZPRStack; 
  int64_t &PPROffset = SplitSVEObjects ? PPRStack : ZPRStack;
```

Also, can you add a brief comment describing why you create two aliases?

https://github.com/llvm/llvm-project/pull/142391
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [HLSL][RootSignature] Implement diagnostic for missed comma (PR #147350)

2025-07-07 Thread Finn Plummer via llvm-branch-commits

inbelic wrote:

Contemplating whether I should split this into two PRs. Will see if there is a 
nice way to de-couple the diagnostic-improvement portion from the error-fix 
portion of this.

https://github.com/llvm/llvm-project/pull/147350
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [HLSL][RootSignature] Implement diagnostic for missed comma (PR #147350)

2025-07-07 Thread Finn Plummer via llvm-branch-commits

https://github.com/inbelic converted_to_draft 
https://github.com/llvm/llvm-project/pull/147350
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LV] Bundle sub reductions into VPExpressionRecipe (PR #147255)

2025-07-07 Thread Sam Tebbs via llvm-branch-commits

https://github.com/SamTebbs33 created 
https://github.com/llvm/llvm-project/pull/147255

This PR bundles sub reductions into the VPExpressionRecipe class and adjusts 
the cost functions to take the negation into account.
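
For context, a sketch of how a caller might consult the extended hook (the
helper and its surroundings are illustrative, not code from this patch; the
parameter order follows the diff below):

```cpp
// Negated=true models a sub reduction, i.e. acc -= a[i] * b[i]. The base
// implementation returns an invalid cost for that case, so only targets that
// override the hook (AArch64 and ARM in this patch) get the bundled recipe.
static bool canBundleMulAccReduction(const TargetTransformInfo &TTI,
                                     Type *ResTy, VectorType *VecTy,
                                     bool IsSub) {
  InstructionCost Cost = TTI.getMulAccReductionCost(
      /*IsUnsigned=*/false, ResTy, VecTy, /*Negated=*/IsSub,
      TargetTransformInfo::TCK_RecipThroughput);
  return Cost.isValid();
}
```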

From 1a5f4e42e4f9d1eae0222302dcabdf08492f67c3 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs 
Date: Mon, 30 Jun 2025 14:29:54 +0100
Subject: [PATCH] [LV] Bundle sub reductions into VPExpressionRecipe

This PR bundles sub reductions into the VPExpressionRecipe class and
adjusts the cost functions to take the negation into account.
---
 .../llvm/Analysis/TargetTransformInfo.h   |   4 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   2 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h  |   3 +
 llvm/lib/Analysis/TargetTransformInfo.cpp |   5 +-
 .../AArch64/AArch64TargetTransformInfo.cpp|   7 +-
 .../AArch64/AArch64TargetTransformInfo.h  |   2 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |   7 +-
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h  |   1 +
 .../Transforms/Vectorize/LoopVectorize.cpp|   6 +-
 llvm/lib/Transforms/Vectorize/VPlan.h |  11 ++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  35 -
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  33 ++--
 .../Transforms/Vectorize/VectorCombine.cpp|   4 +-
 .../vplan-printing-reductions.ll  | 143 ++
 14 files changed, 236 insertions(+), 27 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c43870392361d..3cc0ea01953c3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1645,8 +1645,10 @@ class TargetTransformInfo {
   /// extensions. This is the cost of as:
   /// ResTy vecreduce.add(mul (A, B)).
   /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B)).
+  /// The multiply can optionally be negated, which signifies that it is a sub
+  /// reduction.
   LLVM_ABI InstructionCost getMulAccReductionCost(
-  bool IsUnsigned, Type *ResTy, VectorType *Ty,
+  bool IsUnsigned, Type *ResTy, VectorType *Ty, bool Negated,
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
   /// Calculate the cost of an extended reduction pattern, similar to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 12f87226c5f57..fd22981a5dbf3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -960,7 +960,7 @@ class TargetTransformInfoImplBase {
 
   virtual InstructionCost
   getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
- TTI::TargetCostKind CostKind) const {
+ bool Negated, TTI::TargetCostKind CostKind) const {
 return 1;
   }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index bf958e100f2ac..a9c9fa6d1db0d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -3116,7 +3116,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase {
 
   InstructionCost
   getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
+ bool Negated,
  TTI::TargetCostKind CostKind) const override {
+if (Negated)
+  return InstructionCost::getInvalid(CostKind);
 // Without any native support, this is equivalent to the cost of
 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
 // vecreduce.add(mul(A, B)).
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3ebd9d487ba04..ba0d070bffe6d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1274,9 +1274,10 @@ InstructionCost TargetTransformInfo::getExtendedReductionCost(
 }
 
 InstructionCost TargetTransformInfo::getMulAccReductionCost(
-bool IsUnsigned, Type *ResTy, VectorType *Ty,
+bool IsUnsigned, Type *ResTy, VectorType *Ty, bool Negated,
 TTI::TargetCostKind CostKind) const {
-  return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, CostKind);
+  return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, Negated,
+ CostKind);
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 380faa6cf6939..d9a367535baf4 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5316,8 +5316,10 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost(
 
 InstructionCost
 AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
-   VectorType *VecTy,
+   VectorType *VecTy, bool Negated,

[llvm-branch-commits] [llvm] [LV] Bundle sub reductions into VPExpressionRecipe (PR #147255)

2025-07-07 Thread via llvm-branch-commits

llvmbot wrote:

@llvm/pr-subscribers-backend-arm

Author: Sam Tebbs (SamTebbs33)

Changes

This PR bundles sub reductions into the VPExpressionRecipe class and adjusts 
the cost functions to take the negation into account.

---

Patch is 23.85 KiB, full version: https://github.com/llvm/llvm-project/pull/147255.diff


14 Files Affected:

- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+3-1) 
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+1-1) 
- (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+3) 
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+3-2) 
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+5-2) 
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (+1-1) 
- (modified) llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp (+5-2) 
- (modified) llvm/lib/Target/ARM/ARMTargetTransformInfo.h (+1) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+3-3) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+11) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+32-3) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+23-10) 
- (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll (+143) 

