[llvm-branch-commits] [llvm] [AMDGPU][NPM] Complete optimized regalloc pipeline (PR #138491)
@@ -2174,7 +2174,44 @@ void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization( addPass(SIShrinkInstructionsPass()); } +void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( +AddMachinePass &addPass) const { + if (EnableDCEInRA) +insertPass(DeadMachineInstructionElimPass()); + + // FIXME: when an instruction has a Killed operand, and the instruction is + // inside a bundle, seems only the BUNDLE instruction appears as the Kills of + // the register in LiveVariables, this would trigger a failure in verifier, + // we should fix it and enable the verifier. + if (OptVGPRLiveRange) +insertPass>( optimisan wrote: Since we are moving to the callback style TargetPassBuilder design, I am keeping this the same as legacy. https://github.com/llvm/llvm-project/pull/138491
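To illustrate the callback-style insertPass() mechanism referenced above, here is a minimal standalone sketch; the Builder and PassManager types are simplified stand-ins invented for this example, not the real CodeGenPassBuilder API:

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Toy model only: real CodeGenPassBuilder keys callbacks on pass types, not
// strings, and adds passes to a MachineFunctionPassManager.
struct PassManager {
  std::vector<std::string> Passes;
  void addPass(const std::string &Name) { Passes.push_back(Name); }
};

struct Builder {
  // Callbacks fire after each pass is added, keyed on the pass name.
  // Mutable so that insertPass() can be const, mirroring the patch.
  mutable std::vector<std::function<void(const std::string &, PassManager &)>>
      AfterCallbacks;

  // Schedule InsertedName to run immediately after TargetName.
  void insertPass(std::string TargetName, std::string InsertedName) const {
    AfterCallbacks.push_back([=](const std::string &Name, PassManager &PM) {
      if (Name == TargetName)
        PM.addPass(InsertedName);
    });
  }

  void addPass(PassManager &PM, const std::string &Name) const {
    PM.addPass(Name);
    for (auto &CB : AfterCallbacks)
      CB(Name, PM);
  }
};

int main() {
  Builder B;
  PassManager PM;
  B.insertPass("phi-node-elimination", "si-lower-control-flow");
  B.addPass(PM, "two-address-instruction");
  B.addPass(PM, "phi-node-elimination"); // triggers the insertion
  for (auto &P : PM.Passes)
    std::cout << P << "\n"; // ends with: phi-node-elimination, si-lower-control-flow
}
```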
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Complete optimized regalloc pipeline (PR #138491)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/138491 >From 3d1996a1d347eb14d6908d789307c0a3eef0568c Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Mon, 5 May 2025 06:30:03 + Subject: [PATCH] [AMDGPU][NPM] Complete optimized regalloc pipeline Also fill in some other passes. --- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 41 +-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1 + llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 7 +++- 4 files changed, 45 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index a3b19af4adc39..29bc432ba3d5d 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -574,7 +574,7 @@ template class CodeGenPassBuilder { /// Insert InsertedPass pass after TargetPass pass. /// Only machine function passes are supported. template - void insertPass(InsertedPassT &&Pass) { + void insertPass(InsertedPassT &&Pass) const { AfterCallbacks.emplace_back( [&](StringRef Name, MachineFunctionPassManager &MFPM) mutable { if (Name == TargetPassT::name()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 09b40c9173ff6..3f325398752a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -2192,7 +2192,44 @@ void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization( addPass(SIShrinkInstructionsPass()); } +void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( +AddMachinePass &addPass) const { + if (EnableDCEInRA) +insertPass(DeadMachineInstructionElimPass()); + + // FIXME: when an instruction has a Killed operand, and the instruction is + // inside a bundle, seems only the BUNDLE instruction appears as the Kills of + // the register in LiveVariables, this would trigger a failure in verifier, + // we should fix it and enable the verifier. + if (OptVGPRLiveRange) +insertPass>( +SIOptimizeVGPRLiveRangePass()); + + // This must be run immediately after phi elimination and before + // TwoAddressInstructions, otherwise the processing of the tied operand of + // SI_ELSE will introduce a copy of the tied operand source after the else. + insertPass(SILowerControlFlowPass()); + + if (EnableRewritePartialRegUses) +insertPass(GCNRewritePartialRegUsesPass()); + + if (isPassEnabled(EnablePreRAOptimizations)) +insertPass(GCNPreRAOptimizationsPass()); + // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation + // instructions that cause scheduling barriers. + insertPass(SIWholeQuadModePass()); + + if (OptExecMaskPreRA) +insertPass(SIOptimizeExecMaskingPreRAPass()); + + // This is not an essential optimization and it has a noticeable impact on + // compilation time, so we only enable it from O2. + if (TM.getOptLevel() > CodeGenOptLevel::Less) +insertPass(SIFormMemoryClausesPass()); + + Base::addOptimizedRegAlloc(addPass); +} Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( AddMachinePass &addPass) const { @@ -2220,21 +2257,19 @@ Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( addPass(SIPreAllocateWWMRegsPass()); // For allocating other wwm register operands. - // addRegAlloc(addPass, RegAllocPhase::WWM); addPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"})); addPass(SILowerWWMCopiesPass()); addPass(VirtRegRewriterPass(false)); addPass(AMDGPUReserveWWMRegsPass()); // For allocating per-thread VGPRs. 
- // addRegAlloc(addPass, RegAllocPhase::VGPR); addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"})); addPreRewrite(addPass); addPass(VirtRegRewriterPass(true)); - // TODO: addPass(AMDGPUMarkLastScratchLoadPass()); + addPass(AMDGPUMarkLastScratchLoadPass()); return Error::success(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 589123274d0f5..3c62cd19c6e57 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -182,6 +182,7 @@ class AMDGPUCodeGenPassBuilder void addPostRegAlloc(AddMachinePass &) const; void addPreEmitPass(AddMachinePass &) const; Error addRegAssignmentOptimized(AddMachinePass &) const; + void addOptimizedRegAlloc(AddMachinePass &) const; /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. Otherwise its default will be used diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 5155ec212c12f..0fa4619be53df 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -8,8 +8,11 @@ ; RUN: | FileCheck -check-prefix=GC
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Complete optimized regalloc pipeline (PR #138491)
https://github.com/cdevadas approved this pull request. https://github.com/llvm/llvm-project/pull/138491
[llvm-branch-commits] [llvm] [CodeGen][NPM] Port InitUndef to NPM (PR #138495)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/138495 >From 7938c63eb0883f47cffd02219400780ede01e559 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Mon, 5 May 2025 08:47:42 + Subject: [PATCH 1/3] [CodeGen][NPM] Port InitUndef to NPM --- llvm/include/llvm/CodeGen/InitUndef.h | 24 + llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 2 +- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/InitUndef.cpp| 50 +-- llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/test/CodeGen/AArch64/init-undef.mir | 3 ++ llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +-- .../rvv/handle-noreg-with-implicit-def.mir| 2 + .../rvv/subregister-undef-early-clobber.mir | 1 + .../RISCV/rvv/undef-earlyclobber-chain.mir| 1 + 12 files changed, 74 insertions(+), 21 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/InitUndef.h diff --git a/llvm/include/llvm/CodeGen/InitUndef.h b/llvm/include/llvm/CodeGen/InitUndef.h new file mode 100644 index 0..7274824a74905 --- /dev/null +++ b/llvm/include/llvm/CodeGen/InitUndef.h @@ -0,0 +1,24 @@ +//===- llvm/CodeGen/InitUndef.h *- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLVM_CODEGEN_INITUNDEF_H +#define LLVM_CODEGEN_INITUNDEF_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class InitUndefPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, +MachineFunctionAnalysisManager &MFAM); +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_INITUNDEF_H diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 1b5b1d524..20462288ef667 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -326,7 +326,7 @@ LLVM_ABI void initializeTargetTransformInfoWrapperPassPass(PassRegistry &); LLVM_ABI void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &); LLVM_ABI void initializeTypeBasedAAWrapperPassPass(PassRegistry &); LLVM_ABI void initializeTypePromotionLegacyPass(PassRegistry &); -LLVM_ABI void initializeInitUndefPass(PassRegistry &); +LLVM_ABI void initializeInitUndefLegacyPass(PassRegistry &); LLVM_ABI void initializeUniformityInfoWrapperPassPass(PassRegistry &); LLVM_ABI void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &); LLVM_ABI void initializeUnpackMachineBundlesPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 29bc432ba3d5d..a3f439119b7da 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -43,6 +43,7 @@ #include "llvm/CodeGen/GlobalMerge.h" #include "llvm/CodeGen/GlobalMergeFunctions.h" #include "llvm/CodeGen/IndirectBrExpand.h" +#include "llvm/CodeGen/InitUndef.h" #include "llvm/CodeGen/InterleavedAccess.h" #include "llvm/CodeGen/InterleavedLoadCombine.h" #include "llvm/CodeGen/JMCInstrumenter.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 518dc55acb99b..e03038921af99 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -149,6 +149,7 @@ MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass()) 
MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass()) MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass()) MACHINE_FUNCTION_PASS("fixup-statepoint-caller-saved", FixupStatepointCallerSavedPass()) +MACHINE_FUNCTION_PASS("init-undef", InitUndefPass()) MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass()) MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass()) MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass()) @@ -306,7 +307,6 @@ DUMMY_MACHINE_FUNCTION_PASS("fs-profile-loader", MIRProfileLoaderNewPass) DUMMY_MACHINE_FUNCTION_PASS("funclet-layout", FuncletLayoutPass) DUMMY_MACHINE_FUNCTION_PASS("gc-empty-basic-blocks", GCEmptyBasicBlocksPass) DUMMY_MACHINE_FUNCTION_PASS("implicit-null-checks", ImplicitNullChecksPass) -DUMMY_MACHINE_FUNCTION_PASS("init-undef-pass", InitUndefPass) DUMMY_MACHINE_FUNCTION_PASS("instruction-select", InstructionSelectPass) DUMMY_MACHINE_FUNCTION_PASS("irtranslator", IRTranslatorPass) DUMMY_MACHINE_FUNCTION_PASS("kcfi", MachineKCFIPass) diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 5250534d8a4e4..aa3591cb6be58 100644 --- a/llvm/lib/CodeGen/CodeG
[llvm-branch-commits] [libc] [libc] Modular printf option (float only) (PR #147426)
@@ -0,0 +1,41 @@ +#ifdef LIBC_COPT_PRINTF_MODULAR petrhosek wrote: This file needs the copyright header. https://github.com/llvm/llvm-project/pull/147426
[llvm-branch-commits] [llvm] release/20.x: [AArch64][SME] Fix restoring callee-saves from FP with hazard padding (PR #144693)
MacDue wrote: I think it's reasonably safe given that the general case (without hazard padding) is well used and tested, and there have been no issues reported since this landed a few weeks back. https://github.com/llvm/llvm-project/pull/144693
[llvm-branch-commits] [llvm] release/20.x: [LoongArch] Pass OptLevel to LoongArchDAGToDAGISel correctly (PR #144459)
leecheechen wrote: Fixed a crash caused by incorrectly passing OptLevel to LoongArchDAGToDAGISel. https://github.com/llvm/llvm-project/pull/144459
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
llvmbot wrote: @llvm/pr-subscribers-llvm-globalisel Author: Sameer Sahasrabuddhe (ssahasra) Changes The memory legalizer is currently responsible for emitting wait instructions at ordering operations such as acquire and release. It tries to be efficient by emitting waits only when required. In particular, it does not emit a wait on vmcnt at workgroup scope since that ordering is already guaranteed by the architecture. But this is now incorrect because direct loads to LDS have an LDS component which needs explicit ordering on vmcnt. But it is inefficient to always emit a wait on vmcnt since the majority of programs do not use direct loads to LDS, and this will affect all workgroup scope operations. As a first step toward that, the memory legalizer now emits a soft wait instruction even if all counts are trivially ~0. This is a placeholder that the SIInsertWaitcnts pass will either optimize away or strengthen based on its analysis of whether direct loads to LDS are pending at this point in the program. --- Patch is 4.42 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147257.diff 41 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+25-33) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+6-6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll (+112) - (modified) llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll (+2-2) - (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+2-2) - (modified) llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll (+6-6) - (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+6-6) - (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+66-66) - (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+2) - (modified) llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll (+64) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll (+168-6) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll (+220-4) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll (+160-32) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll (+1420) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (+160-32) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+14-2) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll (+1410) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (+576-68) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll (+192) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll (+1152-52) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll (+168) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (+14-1) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll (+1152-52) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll (+706-82) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll (+940-98) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll (+3-2) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll (+1548) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll (+940-98) - (modified)
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (+27-7) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll (+1548) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll (+940-98) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir (+31) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+12) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir (+31) - (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll (+4-4) - (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (+4-4) - (modified) llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll (+6) - (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+5) - (modified) llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir (+1) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 3212060f303a5..f015d3ad7811e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1074,8 +1074,6 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, A
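A behavioral sketch of the scheme described in the Changes section above: the legalizer plants a placeholder wait with every counter at its maximum, and SIInsertWaitcnts later strengthens or deletes it. The types and the pending-LDS-DMA count below are invented for illustration and are not the real pass structures:

```cpp
#include <optional>

// Invented toy types, not the real SIInsertWaitcnts data structures.
constexpr unsigned kMaxVmCnt = 63; // "trivially ~0": waits for nothing

struct SoftWait {
  unsigned VmCnt = kMaxVmCnt; // placeholder emitted by the memory legalizer
};

// Resolve a placeholder wait: strengthen it if direct loads to LDS are
// still outstanding, drop it if it remains a no-op, keep it otherwise.
std::optional<SoftWait> resolveSoftWait(SoftWait W, unsigned PendingLdsDmaOps) {
  if (PendingLdsDmaOps > 0)
    return SoftWait{0};  // upgrade: wait for all outstanding vmcnt events
  if (W.VmCnt == kMaxVmCnt)
    return std::nullopt; // trivially ~0 and nothing pending: delete it
  return W;
}
```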
[llvm-branch-commits] [llvm] [LV] Use VPReductionRecipe for partial reductions (PR #146073)
@@ -2744,6 +2702,12 @@ class VPSingleDefBundleRecipe : public VPSingleDefRecipe { /// vector operands, performing a reduction.add on the result, and adding /// the scalar result to a chain. MulAccumulateReduction, +/// Represent an inloop multiply-accumulate reduction, multiplying the +/// extended vector operands, negating the multiplication, performing a +/// reduction.add +/// on the result, and adding +/// the scalar result to a chain. +ExtNegatedMulAccumulateReduction, SamTebbs33 wrote: Thanks Florian, that sounds like a good approach. https://github.com/llvm/llvm-project/pull/146073
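For intuition, the scalar pattern that such an ExtNegatedMulAccumulateReduction bundle would cover looks roughly like the following (an assumed example, not taken from the patch):

```cpp
#include <cstdint>

// Illustrative scalar loop: the narrow inputs are sign-extended, multiplied,
// the product negated, and the result accumulated into a wider reduction.
int32_t extNegatedMulAcc(const int8_t *a, const int8_t *b, int n) {
  int32_t acc = 0;
  for (int i = 0; i < n; ++i)
    acc += -(int32_t(a[i]) * int32_t(b[i])); // extend, multiply, negate, add
  return acc;
}
```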
[llvm-branch-commits] [llvm] [AMDGPU] efficiently wait for direct loads to LDS at all scopes (PR #147258)
https://github.com/ssahasra created https://github.com/llvm/llvm-project/pull/147258 Currently, the memory legalizer does not generate any wait on vmcnt at workgroup scope. This is incorrect because direct loads to LDS are tracked using vmcnt and they need to be released properly at workgroup scope. The memory legalizer was previously updated to always emit a soft wait instruction even when all counts are trivially ~0. SIInsertWaitcnts now examines pending loads to LDS at each S_WAITCNT_soft instruction. If such pending loads exist, the vmcnt (which could be ~0) is upgraded to a value that waits for any such pending loads to LDS. After that, any soft instruction that has only trivial ~0 counts is automatically dropped. Thus, common programs that do not use direct loads to LDS remain unaffected, but programs that do use such loads see a correct and efficient vmcnt even at workgroup scope. >From de111cd96570df7127722cb7df476cb833694f72 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Tue, 17 Jun 2025 13:11:55 +0530 Subject: [PATCH 1/2] [AMDGCN] pre-checkin test for LDS DMA and release operations --- .../AMDGPU/lds-dma-workgroup-release.ll | 482 ++ 1 file changed, 482 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll new file mode 100644 index 0..1db15c3c6099c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=GFX1010 + +; In each of these tests, an LDS DMA operation is followed by a release pattern +; at workgroup scope. The fence in such a release (implicit or explicit) should +; wait for the store component in the LDS DMA. The additional noalias metadata +; is just meant to ensure that the wait counts are not generated due to some +; unintended aliasing.
+ +declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) + +define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, +; GFX900-LABEL: barrier_release: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT:s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX900-NEXT:v_mov_b32_e32 v0, 0x800 +; GFX900-NEXT:v_mov_b32_e32 v1, 0 +; GFX900-NEXT:s_waitcnt lgkmcnt(0) +; GFX900-NEXT:s_mov_b32 m0, s12 +; GFX900-NEXT:s_nop 0 +; GFX900-NEXT:buffer_load_dword v0, s[8:11], 0 offen lds +; GFX900-NEXT:v_mov_b32_e32 v0, s13 +; GFX900-NEXT:s_waitcnt vmcnt(0) +; GFX900-NEXT:s_barrier +; GFX900-NEXT:ds_read_b32 v0, v0 +; GFX900-NEXT:s_waitcnt lgkmcnt(0) +; GFX900-NEXT:global_store_dword v1, v0, s[14:15] +; GFX900-NEXT:s_endpgm +; +; GFX90A-LABEL: barrier_release: +; GFX90A: ; %bb.1: +; GFX90A-NEXT:s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT:s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT:s_waitcnt lgkmcnt(0) +; GFX90A-NEXT:s_branch .LBB0_0 +; GFX90A-NEXT:.p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB0_0: ; %main_body +; GFX90A-NEXT:s_mov_b32 m0, s12 +; GFX90A-NEXT:v_mov_b32_e32 v0, 0x800 +; GFX90A-NEXT:buffer_load_dword v0, s[8:11], 0 offen lds +; GFX90A-NEXT:v_mov_b32_e32 v0, s13 +; GFX90A-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX90A-NEXT:s_waitcnt lgkmcnt(0) +; GFX90A-NEXT:s_barrier +; GFX90A-NEXT:s_waitcnt vmcnt(0) +; GFX90A-NEXT:ds_read_b32 v0, v0 +; GFX90A-NEXT:v_mov_b32_e32 v1, 0 +; GFX90A-NEXT:s_waitcnt lgkmcnt(0) +; GFX90A-NEXT:global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT:s_endpgm +; +; GFX90A-TGSPLIT-LABEL: barrier_release: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT:s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT:s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT:s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT:s_branch .LBB0_0 +; GFX90A-TGSPLIT-NEXT:.p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB0_0: ; %main_body +; GFX90A-TGSPLIT-NEXT:s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT:v_mov_b32_e32 v0, 0x800 +; GFX90A-TGSPLIT-NEXT:buffer_load_dword v0, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT:v_mov_b32_e32 v0, s13 +; G
[llvm-branch-commits] [llvm] [AMDGPU] efficiently wait for direct loads to LDS at all scopes (PR #147258)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Sameer Sahasrabuddhe (ssahasra) Changes Currently, the memory legalizer does not generate any wait on vmcnt at workgroup scope. This is incorrect because direct loads to LDS are tracked using vmcnt and they need to be released properly at workgroup scope. The memory legalizer was previously updated to always emit a soft wait instruction even when all counts are trivially ~0. SIInsertWaitcnts now examines pending loads to LDS at each S_WAITCNT_soft instruction. If such pending loads exist, the vmcnt (which could be ~0) is upgraded to a value that waits for any such pending loads to LDS. After that, any soft instruction that has only trivial ~0 counts is automatically dropped. Thus, common programs that do not use direct loads to LDS remain unaffected, but programs that do use such loads see a correct and efficient vmcnt even at workgroup scope. --- Patch is 22.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147258.diff 2 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+13) - (added) llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll (+482) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359f03da6..b57cfe5d6f2c5 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1374,6 +1374,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); + if (!WaitcntInstr && II.getOpcode() == AMDGPU::S_WAITCNT_soft) { +// Each direct load to LDS is also a store to LDS, but we do not have a +// separate counter for it. Instead these operations increment LOAD_CNT +// and need to be waited for at a release fence. So we treat a release +// fence as if it depends on any previous LDS DMA stores. +// +// Note that a user-specified S_WAITCNT instruction is not affected; we +// only check for S_WAITCNT_soft since that represents a fence. +// +// FIXME: How does one detect that a soft wait is a release??? +ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); + } + // Merge consecutive waitcnt of the same type by erasing multiples. + if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) { II.eraseFromParent(); diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll new file mode 100644 index 0..882c43b41bac8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=GFX1010 + +; In each of these tests, an LDS DMA operation is followed by a release pattern +; at workgroup scope. The fence in such a release (implicit or explicit) should +; wait for the store component in the LDS DMA.
The additional noalias metadata +; is just meant to ensure that the wait counts are not generated due to some +; unintended aliasing. + +declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) + +define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, +; GFX900-LABEL: barrier_release: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT:s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX900-NEXT:v_mov_b32_e32 v0, 0x800 +; GFX900-NEXT:v_mov_b32_e32 v1, 0 +; GFX900-NEXT:s_waitcnt lgkmcnt(0) +; GFX900-NEXT:s_mov_b32 m0, s12 +; GFX900-NEXT:s_nop 0 +; GFX900-NEXT:buffer_load_dword v0, s[8:11], 0 offen lds +; GFX900-NEXT:v_mov_b32_e32 v0, s13 +; GFX900-NEXT:s_waitcnt vmcnt(0) +; GFX900-NEXT:s_barrier +; GFX900-NEXT:ds_read_b32 v0, v0 +; GFX900-NEXT:s_waitcnt lgkmcnt(0) +; GFX900-NEXT:global_store_dword v1, v0, s[14:15] +; GFX900-NEXT:s_endpgm +; +; GFX90A-LABEL: barrier_release: +; GFX90A: ; %bb.1: +; GFX90A-NEXT:s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT:s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT:s_waitcnt lgkmcnt(0) +; GFX90A-NEXT:s_branch .LBB0_0 +; GFX90A-NEXT:.p2align 8 +; GFX90A-NEXT: ; %bb.2:
[llvm-branch-commits] [llvm] [SelectionDAG] Deal with POISON for INSERT_VECTOR_ELT/INSERT_SUBVECTOR (part 3) (PR #143105)
@@ -953,8 +953,17 @@ class SelectionDAG { } /// Insert \p SubVec at the \p Idx element of \p Vec. + /// If \p SkipUndef is true and \p SubVec is UNDEF/POISON, then \p Vec is + /// returned. arsenm wrote: This is a strange pattern that I feel like should be avoided. I don't understand why this would need semantic treatment. These get* functions should just return the node with the requested operands. https://github.com/llvm/llvm-project/pull/143105
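To make the two API shapes under discussion concrete, here is a toy sketch; none of these types are the real SelectionDAG API:

```cpp
#include <cassert>
#include <memory>

// Invented stand-ins for SDValue/SDNode.
struct Node { bool IsUndef = false; };
using NodeRef = std::shared_ptr<Node>;

// Shape A (as in the patch): the builder folds a special case internally.
NodeRef insertSubvector(NodeRef Vec, NodeRef Sub, bool SkipUndef) {
  if (SkipUndef && Sub->IsUndef)
    return Vec;                    // semantic fold hidden inside the getter
  return std::make_shared<Node>(); // otherwise build the requested node
}

// Shape B (the objection): the builder always builds; callers decide.
NodeRef insertSubvectorPlain(NodeRef Vec, NodeRef Sub) {
  (void)Vec; (void)Sub;
  return std::make_shared<Node>();
}

int main() {
  auto Vec = std::make_shared<Node>();
  auto Undef = std::make_shared<Node>();
  Undef->IsUndef = true;
  assert(insertSubvector(Vec, Undef, /*SkipUndef=*/true) == Vec);
  auto Result = Undef->IsUndef ? Vec : insertSubvectorPlain(Vec, Undef);
  assert(Result == Vec); // caller-side check, equivalent outcome
}
```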
[llvm-branch-commits] [llvm] ARM: Remove subtarget field tracking SjLj (PR #147226)
https://github.com/DanielKristofKiss approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/147226
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Add canonical loop LLVM-IR lowering (PR #147069)
https://github.com/Meinersbur edited https://github.com/llvm/llvm-project/pull/147069
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f ssahasra wrote: Not directly related to this discussion, but this line does exist: ``` 1390 // Merge consecutive waitcnt of the same type by erasing multiples. 1391 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) { ``` It is meant to preserve S_WAITCNT_soft even if there is no actual wait required. @jayfoad , you had introduced `TrySimplify` ... do you think it is okay to relax its uses? ``` 1373 if (TrySimplify **|| (Opcode != II.getOpcode() && OldWait.hasValuesSetToMax())**) 1374 ScoreBrackets.simplifyWaitcnt(OldWait); ``` Here, `hasValuesSetToMax()` is a hypothetical function that checks the encoding of each count separately to have all bits set to 1, and not just a ~0 in the data structure. https://github.com/llvm/llvm-project/pull/147257
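One possible shape for the hypothetical `hasValuesSetToMax()` mentioned above, using invented toy types rather than the real AMDGPU::Waitcnt layout:

```cpp
// Toy types only: checks each counter against its per-target encoding
// maximum, rather than against the in-memory ~0 sentinel.
struct ToyWaitcnt { unsigned LoadCnt, ExpCnt, DsCnt, StoreCnt; };
struct ToyLimits  { unsigned LoadCntMax, ExpCntMax, DsCntMax, StoreCntMax; };

bool hasValuesSetToMax(const ToyWaitcnt &W, const ToyLimits &L) {
  return W.LoadCnt >= L.LoadCntMax && W.ExpCnt >= L.ExpCntMax &&
         W.DsCnt >= L.DsCntMax && W.StoreCnt >= L.StoreCntMax;
}
```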
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
https://github.com/ssahasra edited https://github.com/llvm/llvm-project/pull/147257
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Add canonical loop LLVM-IR lowering (PR #147069)
https://github.com/Meinersbur updated https://github.com/llvm/llvm-project/pull/147069 >From da2613d525deb4edcf0fac41e865ca0510c75210 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Fri, 4 Jul 2025 16:26:20 +0200 Subject: [PATCH] omp.canonical_loop and omp.unroll_heuristic lowering --- .../mlir/Target/LLVMIR/ModuleTranslation.h| 43 + .../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp | 10 + .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 78 .../LLVMIR/openmp-cli-canonical_loop.mlir | 175 ++ .../LLVMIR/openmp-cli-unroll-heuristic01.mlir | 56 ++ .../LLVMIR/openmp-cli-unroll-heuristic02.mlir | 93 ++ 6 files changed, 455 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-canonical_loop.mlir create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic01.mlir create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic02.mlir diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h index 79e8bb6add0da..5d52cf3f04b6a 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -15,6 +15,7 @@ #define MLIR_TARGET_LLVMIR_MODULETRANSLATION_H #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Operation.h" #include "mlir/IR/SymbolTable.h" #include "mlir/IR/Value.h" @@ -24,6 +25,7 @@ #include "mlir/Target/LLVMIR/TypeToLLVM.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/FPEnv.h" namespace llvm { @@ -108,6 +110,41 @@ class ModuleTranslation { return blockMapping.lookup(block); } + /// Find the LLVM-IR loop that represents an MLIR loop. + llvm::CanonicalLoopInfo *lookupOMPLoop(omp::NewCliOp mlir) const { +llvm::CanonicalLoopInfo *result = loopMapping.lookup(mlir); +assert(result && "attempt to get non-existing loop"); +return result; + } + + /// Find the LLVM-IR loop that represents an MLIR loop. + llvm::CanonicalLoopInfo *lookupOMPLoop(Value mlir) const { +return lookupOMPLoop(mlir.getDefiningOp()); + } + + /// Mark an OpenMP loop as having been consumed. + void invalidateOmpLoop(omp::NewCliOp mlir) { loopMapping.erase(mlir); } + + /// Mark an OpenMP loop as having been consumed. + void invalidateOmpLoop(Value mlir) { +invalidateOmpLoop(mlir.getDefiningOp()); + } + + /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR + /// OpenMPIRBuilder CanonicalLoopInfo + void mapOmpLoop(omp::NewCliOp mlir, llvm::CanonicalLoopInfo *llvm) { +assert(llvm && "argument must be non-null"); +llvm::CanonicalLoopInfo *&cur = loopMapping[mlir]; +assert(cur == nullptr && "attempting to map a loop that is already mapped"); +cur = llvm; + } + + /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR + /// OpenMPIRBuilder CanonicalLoopInfo + void mapOmpLoop(Value mlir, llvm::CanonicalLoopInfo *llvm) { +mapOmpLoop(mlir.getDefiningOp(), llvm); + } + /// Stores the mapping between an MLIR operation with successors and a /// corresponding LLVM IR instruction. void mapBranch(Operation *mlir, llvm::Instruction *llvm) { @@ -381,6 +418,12 @@ class ModuleTranslation { DenseMap valueMapping; DenseMap blockMapping; + /// List of not yet consumed MLIR loop handles (represented by an omp.new_cli + /// operation which creates a value of type CanonicalLoopInfoType) and their + /// LLVM-IR representation as CanonicalLoopInfo which is managed by the + /// OpenMPIRBuilder. 
+ DenseMap loopMapping; + /// A mapping between MLIR LLVM dialect terminators and LLVM IR terminators /// they are converted to. This allows for connecting PHI nodes to the source /// values after all operations are converted. diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index 7a0a7f86bc1e9..e77c4a0b94de9 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -42,6 +42,16 @@ template struct OpenMPOpConversion : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + OpenMPOpConversion(LLVMTypeConverter &typeConverter, + PatternBenefit benefit = 1) + : ConvertOpToLLVMPattern(typeConverter, benefit) { +// Operations using CanonicalLoopInfoType are lowered only by +// mlir::translateModuleToLLVMIR() using the OpenMPIRBuilder. Until then, +// the type and operations using it must be preserved. +typeConverter.addConversion( +[&](::mlir::omp::CanonicalLoopInfoType type) { return type; }); + } + LogicalResult matchAndRewrite(T op, typename T::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP
[llvm-branch-commits] [clang] [LifetimeSafety] Propagate loans using dataflow analysis (PR #147295)
https://github.com/usx95 updated https://github.com/llvm/llvm-project/pull/147295 >From dd2dd838f44ebeb6d45f75af2934159ee61b385b Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Sun, 6 Jul 2025 19:12:55 + Subject: [PATCH] [LifetimeSafety] Propagate loans using dataflow analysis --- clang/lib/Analysis/LifetimeSafety.cpp | 255 +- .../Sema/warn-lifetime-safety-dataflow.cpp| 186 + 2 files changed, 440 insertions(+), 1 deletion(-) diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index 2c2309de90e26..cdbab31ac7a9c 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -482,7 +482,247 @@ class FactGenerator : public ConstStmtVisitor { }; // = // -// TODO: Run dataflow analysis to propagate loans, analyse and error reporting. +// The Dataflow Lattice +// = // + +// Using LLVM's immutable collections is efficient for dataflow analysis +// as it avoids deep copies during state transitions. +// TODO(opt): Consider using a bitset to represent the set of loans. +using LoanSet = llvm::ImmutableSet; +using OriginLoanMap = llvm::ImmutableMap; + +/// An object to hold the factories for immutable collections, ensuring +/// that all created states share the same underlying memory management. +struct LifetimeFactory { + OriginLoanMap::Factory OriginMapFact; + LoanSet::Factory LoanSetFact; + + LoanSet createLoanSet(LoanID LID) { +return LoanSetFact.add(LoanSetFact.getEmptySet(), LID); + } +}; + +/// LifetimeLattice represents the state of our analysis at a given program +/// point. It is an immutable object, and all operations produce a new +/// instance rather than modifying the existing one. +struct LifetimeLattice { + /// The map from an origin to the set of loans it contains. + /// TODO(opt): To reduce the lattice size, propagate origins of declarations, + /// not expressions, because expressions are not visible across blocks. + OriginLoanMap Origins = OriginLoanMap(nullptr); + + explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {} + LifetimeLattice() = default; + + bool operator==(const LifetimeLattice &Other) const { +return Origins == Other.Origins; + } + bool operator!=(const LifetimeLattice &Other) const { +return !(*this == Other); + } + + LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const { +if (auto *Loans = Origins.lookup(OID)) + return *Loans; +return Factory.LoanSetFact.getEmptySet(); + } + + /// Computes the union of two lattices by performing a key-wise join of + /// their OriginLoanMaps. + // TODO(opt): This key-wise join is a performance bottleneck. A more + // efficient merge could be implemented using a Patricia Trie or HAMT + // instead of the current AVL-tree-based ImmutableMap. + LifetimeLattice join(const LifetimeLattice &Other, + LifetimeFactory &Factory) const { +/// Merge the smaller map into the larger one ensuring we iterate over the +/// smaller map. +if (Origins.getHeight() < Other.Origins.getHeight()) + return Other.join(*this, Factory); + +OriginLoanMap JoinedState = Origins; +// For each origin in the other map, union its loan set with ours. 
+for (const auto &Entry : Other.Origins) { + OriginID OID = Entry.first; + LoanSet OtherLoanSet = Entry.second; + JoinedState = Factory.OriginMapFact.add( + JoinedState, OID, + join(getLoans(OID, Factory), OtherLoanSet, Factory)); +} +return LifetimeLattice(JoinedState); + } + + LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const { +/// Merge the smaller set into the larger one ensuring we iterate over the +/// smaller set. +if (a.getHeight() < b.getHeight()) + std::swap(a, b); +LoanSet Result = a; +for (LoanID LID : b) { + /// TODO(opt): Profiling shows that this loop is a major performance + /// bottleneck. Investigate using a BitVector to represent the set of + /// loans for improved join performance. + Result = Factory.LoanSetFact.add(Result, LID); +} +return Result; + } + + void dump(llvm::raw_ostream &OS) const { +OS << "LifetimeLattice State:\n"; +if (Origins.isEmpty()) + OS << " \n"; +for (const auto &Entry : Origins) { + if (Entry.second.isEmpty()) +OS << " Origin " << Entry.first << " contains no loans\n"; + for (const LoanID &LID : Entry.second) +OS << " Origin " << Entry.first << " contains Loan " << LID << "\n"; +} + } +}; + +// = // +// The Transfer Function +// = // +class T
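For intuition, the key-wise join implemented above behaves like the following plain-STL sketch; it drops the immutability and structural sharing of llvm::ImmutableMap/ImmutableSet, so only the merge semantics carry over:

```cpp
#include <map>
#include <set>

using Origin = int;
using Loan = int;
using Lattice = std::map<Origin, std::set<Loan>>;

// Key-wise join: for every origin present in either state, take the union
// of its loan sets. This mirrors what LifetimeLattice::join computes, minus
// the persistent data structures of the real implementation.
Lattice join(Lattice A, const Lattice &B) {
  for (const auto &[OID, Loans] : B)
    A[OID].insert(Loans.begin(), Loans.end());
  return A;
}
```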
[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)
https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147295
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Add canonical loop LLVM-IR lowering (PR #147069)
llvmbot wrote: @llvm/pr-subscribers-mlir-llvm @llvm/pr-subscribers-mlir Author: Michael Kruse (Meinersbur) Changes Support for translating the operations introduced in #144785 to LLVM-IR. In order to keep the lowering simple, `OpenMPIRBuilder::unrollLoopHeuristic` is applied when encountering the `omp.unroll_heuristic` op. As a result, the operation that unrolling is applied to (`omp.canonical_loop`) must have been emitted before, even though logically there is no such requirement. Eventually, all transformations on a loop must be applied directly after emitting `omp.canonical_loop`, i.e. future transformations must be looked up when encountering `omp.canonical_loop` itself. This is because many OpenMPIRBuilder methods (e.g. `createParallel`) expect all the region code to be emitted within a callback. In the case of `createParallel`, the region code is getting outlined into a new function. Therefore, turning this operation-ordering requirement into an IR requirement would not make the implementation any easier. --- Patch is 21.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147069.diff 6 Files Affected: - (modified) mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h (+43) - (modified) mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp (+10) - (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+78) - (added) mlir/test/Target/LLVMIR/openmp-cli-canonical_loop.mlir (+175) - (added) mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic01.mlir (+56) - (added) mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic02.mlir (+93) ``diff diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h index 79e8bb6add0da..5d52cf3f04b6a 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -15,6 +15,7 @@ #define MLIR_TARGET_LLVMIR_MODULETRANSLATION_H #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Operation.h" #include "mlir/IR/SymbolTable.h" #include "mlir/IR/Value.h" @@ -24,6 +25,7 @@ #include "mlir/Target/LLVMIR/TypeToLLVM.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/FPEnv.h" namespace llvm { @@ -108,6 +110,41 @@ class ModuleTranslation { return blockMapping.lookup(block); } + /// Find the LLVM-IR loop that represents an MLIR loop. + llvm::CanonicalLoopInfo *lookupOMPLoop(omp::NewCliOp mlir) const { +llvm::CanonicalLoopInfo *result = loopMapping.lookup(mlir); +assert(result && "attempt to get non-existing loop"); +return result; + } + + /// Find the LLVM-IR loop that represents an MLIR loop. + llvm::CanonicalLoopInfo *lookupOMPLoop(Value mlir) const { +return lookupOMPLoop(mlir.getDefiningOp()); + } + + /// Mark an OpenMP loop as having been consumed. + void invalidateOmpLoop(omp::NewCliOp mlir) { loopMapping.erase(mlir); } + + /// Mark an OpenMP loop as having been consumed.
+ void invalidateOmpLoop(Value mlir) { +invalidateOmpLoop(mlir.getDefiningOp()); + } + + /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR + /// OpenMPIRBuilder CanonicalLoopInfo + void mapOmpLoop(omp::NewCliOp mlir, llvm::CanonicalLoopInfo *llvm) { +assert(llvm && "argument must be non-null"); +llvm::CanonicalLoopInfo *&cur = loopMapping[mlir]; +assert(cur == nullptr && "attempting to map a loop that is already mapped"); +cur = llvm; + } + + /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR + /// OpenMPIRBuilder CanonicalLoopInfo + void mapOmpLoop(Value mlir, llvm::CanonicalLoopInfo *llvm) { +mapOmpLoop(mlir.getDefiningOp(), llvm); + } + /// Stores the mapping between an MLIR operation with successors and a /// corresponding LLVM IR instruction. void mapBranch(Operation *mlir, llvm::Instruction *llvm) { @@ -381,6 +418,12 @@ class ModuleTranslation { DenseMap valueMapping; DenseMap blockMapping; + /// List of not yet consumed MLIR loop handles (represented by an omp.new_cli + /// operation which creates a value of type CanonicalLoopInfoType) and their + /// LLVM-IR representation as CanonicalLoopInfo which is managed by the + /// OpenMPIRBuilder. + DenseMap loopMapping; + /// A mapping between MLIR LLVM dialect terminators and LLVM IR terminators /// they are converted to. This allows for connecting PHI nodes to the source /// values after all operations are converted. diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index 77a2708653576..7ac9687c4eeda 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -41,6 +41,16 @@ template struct OpenMPOpConversion : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern
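A condensed sketch of the omp.unroll_heuristic lowering flow described above; it is not standalone (the MLIR translation headers are assumed), and the accessor name `getApplyee()` is an assumption about the op interface rather than confirmed API:

```cpp
// Sketch: look up the LLVM-IR loop for the MLIR handle, apply the
// OpenMPIRBuilder transformation, and mark the handle as consumed.
static LogicalResult
convertUnrollHeuristic(omp::UnrollHeuristicOp op, llvm::IRBuilderBase &builder,
                       LLVM::ModuleTranslation &moduleTranslation) {
  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
  // The omp.canonical_loop this applies to must already have been lowered.
  llvm::CanonicalLoopInfo *loop =
      moduleTranslation.lookupOMPLoop(op.getApplyee());
  ompBuilder->unrollLoopHeuristic(builder.getCurrentDebugLocation(), loop);
  moduleTranslation.invalidateOmpLoop(op.getApplyee()); // handle is consumed
  return success();
}
```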
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Add canonical loop LLVM-IR lowering (PR #147069)
https://github.com/Meinersbur ready_for_review https://github.com/llvm/llvm-project/pull/147069
[llvm-branch-commits] [clang] [LifetimeSafety] Propagate loans using dataflow analysis (PR #147295)
https://github.com/usx95 created https://github.com/llvm/llvm-project/pull/147295 None >From 2e4261b02b6230a8c79f01a673cc3030cfff3ea7 Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Sun, 6 Jul 2025 19:12:55 + Subject: [PATCH 1/6] [LifetimeSafety] Propagate loans using dataflow analysis --- clang/lib/Analysis/LifetimeSafety.cpp | 255 +- .../Sema/warn-lifetime-safety-dataflow.cpp| 186 + 2 files changed, 440 insertions(+), 1 deletion(-) diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index 3fe30e36ebd0f..7870352f0287a 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -491,7 +491,247 @@ class FactGenerator : public ConstStmtVisitor { }; // = // -// TODO: Run dataflow analysis to propagate loans, analyse and error reporting. +// The Dataflow Lattice +// = // + +// Using LLVM's immutable collections is efficient for dataflow analysis +// as it avoids deep copies during state transitions. +// TODO(opt): Consider using a bitset to represent the set of loans. +using LoanSet = llvm::ImmutableSet; +using OriginLoanMap = llvm::ImmutableMap; + +/// An object to hold the factories for immutable collections, ensuring +/// that all created states share the same underlying memory management. +struct LifetimeFactory { + OriginLoanMap::Factory OriginMapFact; + LoanSet::Factory LoanSetFact; + + LoanSet createLoanSet(LoanID LID) { +return LoanSetFact.add(LoanSetFact.getEmptySet(), LID); + } +}; + +/// LifetimeLattice represents the state of our analysis at a given program +/// point. It is an immutable object, and all operations produce a new +/// instance rather than modifying the existing one. +struct LifetimeLattice { + /// The map from an origin to the set of loans it contains. + /// TODO(opt): To reduce the lattice size, propagate origins of declarations, + /// not expressions, because expressions are not visible across blocks. + OriginLoanMap Origins = OriginLoanMap(nullptr); + + explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {} + LifetimeLattice() = default; + + bool operator==(const LifetimeLattice &Other) const { +return Origins == Other.Origins; + } + bool operator!=(const LifetimeLattice &Other) const { +return !(*this == Other); + } + + LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const { +if (auto *Loans = Origins.lookup(OID)) + return *Loans; +return Factory.LoanSetFact.getEmptySet(); + } + + /// Computes the union of two lattices by performing a key-wise join of + /// their OriginLoanMaps. + // TODO(opt): This key-wise join is a performance bottleneck. A more + // efficient merge could be implemented using a Patricia Trie or HAMT + // instead of the current AVL-tree-based ImmutableMap. + LifetimeLattice join(const LifetimeLattice &Other, + LifetimeFactory &Factory) const { +/// Merge the smaller map into the larger one ensuring we iterate over the +/// smaller map. +if (Origins.getHeight() < Other.Origins.getHeight()) + return Other.join(*this, Factory); + +OriginLoanMap JoinedState = Origins; +// For each origin in the other map, union its loan set with ours. 
+for (const auto &Entry : Other.Origins) { + OriginID OID = Entry.first; + LoanSet OtherLoanSet = Entry.second; + JoinedState = Factory.OriginMapFact.add( + JoinedState, OID, + join(getLoans(OID, Factory), OtherLoanSet, Factory)); +} +return LifetimeLattice(JoinedState); + } + + LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const { +/// Merge the smaller set into the larger one ensuring we iterate over the +/// smaller set. +if (a.getHeight() < b.getHeight()) + std::swap(a, b); +LoanSet Result = a; +for (LoanID LID : b) { + /// TODO(opt): Profiling shows that this loop is a major performance + /// bottleneck. Investigate using a BitVector to represent the set of + /// loans for improved join performance. + Result = Factory.LoanSetFact.add(Result, LID); +} +return Result; + } + + void dump(llvm::raw_ostream &OS) const { +OS << "LifetimeLattice State:\n"; +if (Origins.isEmpty()) + OS << " \n"; +for (const auto &Entry : Origins) { + if (Entry.second.isEmpty()) +OS << " Origin " << Entry.first << " contains no loans\n"; + for (const LoanID &LID : Entry.second) +OS << " Origin " << Entry.first << " contains Loan " << LID << "\n"; +} + } +}; + +// = // +// The Transfer Function +// = /
[llvm-branch-commits] [clang] [LifetimeSafety] Propagate loans using dataflow analysis (PR #147295)
https://github.com/usx95 updated https://github.com/llvm/llvm-project/pull/147295 >From 2e4261b02b6230a8c79f01a673cc3030cfff3ea7 Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Sun, 6 Jul 2025 19:12:55 + Subject: [PATCH 1/6] [LifetimeSafety] Propagate loans using dataflow analysis --- clang/lib/Analysis/LifetimeSafety.cpp | 255 +- .../Sema/warn-lifetime-safety-dataflow.cpp| 186 + 2 files changed, 440 insertions(+), 1 deletion(-) diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index 3fe30e36ebd0f..7870352f0287a 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -491,7 +491,247 @@ class FactGenerator : public ConstStmtVisitor { }; // = // -// TODO: Run dataflow analysis to propagate loans, analyse and error reporting. +// The Dataflow Lattice +// = // + +// Using LLVM's immutable collections is efficient for dataflow analysis +// as it avoids deep copies during state transitions. +// TODO(opt): Consider using a bitset to represent the set of loans. +using LoanSet = llvm::ImmutableSet; +using OriginLoanMap = llvm::ImmutableMap; + +/// An object to hold the factories for immutable collections, ensuring +/// that all created states share the same underlying memory management. +struct LifetimeFactory { + OriginLoanMap::Factory OriginMapFact; + LoanSet::Factory LoanSetFact; + + LoanSet createLoanSet(LoanID LID) { +return LoanSetFact.add(LoanSetFact.getEmptySet(), LID); + } +}; + +/// LifetimeLattice represents the state of our analysis at a given program +/// point. It is an immutable object, and all operations produce a new +/// instance rather than modifying the existing one. +struct LifetimeLattice { + /// The map from an origin to the set of loans it contains. + /// TODO(opt): To reduce the lattice size, propagate origins of declarations, + /// not expressions, because expressions are not visible across blocks. + OriginLoanMap Origins = OriginLoanMap(nullptr); + + explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {} + LifetimeLattice() = default; + + bool operator==(const LifetimeLattice &Other) const { +return Origins == Other.Origins; + } + bool operator!=(const LifetimeLattice &Other) const { +return !(*this == Other); + } + + LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const { +if (auto *Loans = Origins.lookup(OID)) + return *Loans; +return Factory.LoanSetFact.getEmptySet(); + } + + /// Computes the union of two lattices by performing a key-wise join of + /// their OriginLoanMaps. + // TODO(opt): This key-wise join is a performance bottleneck. A more + // efficient merge could be implemented using a Patricia Trie or HAMT + // instead of the current AVL-tree-based ImmutableMap. + LifetimeLattice join(const LifetimeLattice &Other, + LifetimeFactory &Factory) const { +/// Merge the smaller map into the larger one ensuring we iterate over the +/// smaller map. +if (Origins.getHeight() < Other.Origins.getHeight()) + return Other.join(*this, Factory); + +OriginLoanMap JoinedState = Origins; +// For each origin in the other map, union its loan set with ours. 
+for (const auto &Entry : Other.Origins) { + OriginID OID = Entry.first; + LoanSet OtherLoanSet = Entry.second; + JoinedState = Factory.OriginMapFact.add( + JoinedState, OID, + join(getLoans(OID, Factory), OtherLoanSet, Factory)); +} +return LifetimeLattice(JoinedState); + } + + LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const { +/// Merge the smaller set into the larger one ensuring we iterate over the +/// smaller set. +if (a.getHeight() < b.getHeight()) + std::swap(a, b); +LoanSet Result = a; +for (LoanID LID : b) { + /// TODO(opt): Profiling shows that this loop is a major performance + /// bottleneck. Investigate using a BitVector to represent the set of + /// loans for improved join performance. + Result = Factory.LoanSetFact.add(Result, LID); +} +return Result; + } + + void dump(llvm::raw_ostream &OS) const { +OS << "LifetimeLattice State:\n"; +if (Origins.isEmpty()) + OS << "  <empty>\n"; +for (const auto &Entry : Origins) { + if (Entry.second.isEmpty()) +OS << " Origin " << Entry.first << " contains no loans\n"; + for (const LoanID &LID : Entry.second) +OS << " Origin " << Entry.first << " contains Loan " << LID << "\n"; +} + } +}; + +// = // +// The Transfer Function +// = // +cla
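The lattice join above leans entirely on LLVM's persistent-container factory API. The following standalone sketch (written for this digest, not part of the patch; `unsigned` stands in for `LoanID`) shows the pattern: adding to an llvm::ImmutableSet returns a new set that shares structure with the old one, so lattice states stay cheap to copy and the join only pays for the smaller operand.

```cpp
#include "llvm/ADT/ImmutableSet.h"
#include "llvm/Support/raw_ostream.h"
#include <utility>

int main() {
  // All sets produced by one factory share nodes internally.
  llvm::ImmutableSet<unsigned>::Factory Factory;
  llvm::ImmutableSet<unsigned> A =
      Factory.add(Factory.add(Factory.getEmptySet(), 1), 2);
  llvm::ImmutableSet<unsigned> B = Factory.add(Factory.getEmptySet(), 3);

  // Join as in the patch: iterate the smaller set, fold it into the larger.
  if (A.getHeight() < B.getHeight())
    std::swap(A, B);
  llvm::ImmutableSet<unsigned> Joined = A;
  for (unsigned Elem : B)
    Joined = Factory.add(Joined, Elem);

  for (unsigned Elem : Joined)
    llvm::outs() << Elem << '\n'; // prints 1, 2, 3
  return 0;
}
```

This is also why the TODOs above single out the AVL-tree representation: each `add` in the join loop is O(log n), which is what the suggested BitVector or Patricia-trie alternatives would avoid.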
[llvm-branch-commits] [llvm] AtomicExpand: Stop using report_fatal_error (PR #147300)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/147300 Emit a context error and delete the instruction. This allows removing the AMDGPU hack where some atomic libcalls are falsely added. NVPTX also later copied the same hack, so remove it there too. For now just emit the generic error, which is not good. It's missing any useful context information (despite taking the instruction). It's also confusing in the failed atomicrmw case, since it's reporting failure at the intermediate failed cmpxchg instead of the original atomicrmw. >From 4d46f60b03774704354e98ccea89d4c622c7d300 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Jul 2025 21:25:22 +0900 Subject: [PATCH] AtomicExpand: Stop using report_fatal_error Emit a context error and delete the instruction. This allows removing the AMDGPU hack where some atomic libcalls are falsely added. NVPTX also later copied the same hack, so remove it there too. For now just emit the generic error, which is not good. It's missing any useful context information (despite taking the instruction). It's also confusing in the failed atomicrmw case, since it's reporting failure at the intermediate failed cmpxchg instead of the original atomicrmw. --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 17 +- llvm/lib/IR/RuntimeLibcalls.cpp | 6 +- llvm/test/CodeGen/AMDGPU/atomic-oversize.ll | 10 - .../CodeGen/AMDGPU/unsupported-atomics.ll | 55 ++ .../CodeGen/NVPTX/atomicrmw-expand.err.ll | 27 + llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll | 28 - .../AMDGPU/expand-atomic-fp128.ll | 122 --- .../AtomicExpand/AMDGPU/expand-atomic-i128.ll | 201 - .../AtomicExpand/AMDGPU/expand-atomic-mmra.ll | 25 - .../AMDGPU/expand-atomicrmw-fp-vector.ll | 752 +- .../AtomicExpand/AMDGPU/unaligned-atomic.ll | 22 +- .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 19 +- 12 files changed, 106 insertions(+), 1178 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/atomic-oversize.ll create mode 100644 llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll create mode 100644 llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll delete mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-fp128.ll delete mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i128.ll diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 044f0732779f3..44295b44482e7 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -65,6 +65,17 @@ class AtomicExpandImpl { const DataLayout *DL = nullptr; private: + void handleFailure(Instruction &FailedInst, const Twine &Msg) const { +LLVMContext &Ctx = FailedInst.getContext(); + +// TODO: Do not use generic error type +Ctx.emitError(&FailedInst, Msg); + +if (!FailedInst.getType()->isVoidTy()) + FailedInst.replaceAllUsesWith(PoisonValue::get(FailedInst.getType())); +FailedInst.eraseFromParent(); + } + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); @@ -1744,7 +1755,7 @@ void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst *I) { I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls); if (!expanded) -report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Load"); +handleFailure(*I, "unsupported atomic load"); } void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) { @@ -1757,7 +1768,7 @@ void 
AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) { I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(), nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls); if (!expanded) -report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Store"); +handleFailure(*I, "unsupported atomic store"); } void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) { @@ -1772,7 +1783,7 @@ void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) { I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(), Libcalls); if (!expanded) -report_fatal_error("expandAtomicOpToLibcall shouldn't fail for CAS"); +handleFailure(*I, "unsupported cmpxchg"); } static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) { diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 712f1a48d0b7b..b21504037be8f 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -455,10 +455,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, // Disable most libcalls on AMDGPU and NVPTX. if (TT.isAMDGPU() || TT.isNVPTX()) { -for (RTLIB::Libcall LC : RTLIB::libcalls()) { - if (!isAtomicLibCall(LC)) -
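The recovery pattern at the core of this change is small enough to restate in isolation. A minimal sketch (function name and includes are mine; the body mirrors the patch's `handleFailure`): report a contextual error through the LLVMContext, then keep the IR well-formed by replacing the failed instruction's uses with poison and erasing it.

```cpp
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

// Report a contextual error, then make the IR well-formed again: any users
// of the failed instruction see poison, and the instruction is removed.
static void reportUnsupportedAtomic(llvm::Instruction &I,
                                    const llvm::Twine &Msg) {
  I.getContext().emitError(&I, Msg);

  if (!I.getType()->isVoidTy())
    I.replaceAllUsesWith(llvm::PoisonValue::get(I.getType()));
  I.eraseFromParent();
}
```

Unlike report_fatal_error, emitError routes through any diagnostic handler the front end installed, so compilation of the rest of the module can continue and the user gets a source-located message instead of a crash.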
[llvm-branch-commits] [llvm] AtomicExpand: Stop using report_fatal_error (PR #147300)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes Emit a context error and delete the instruction. This allows removing the AMDGPU hack where some atomic libcalls are falsely added. NVPTX also later copied the same hack, so remove it there too. For now just emit the generic error, which is not good. It's missing any useful context information (despite taking the instruction). It's also confusing in the failed atomicrmw case, since it's reporting failure at the intermediate failed cmpxchg instead of the original atomicrmw. --- Patch is 92.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147300.diff 12 Files Affected: - (modified) llvm/lib/CodeGen/AtomicExpandPass.cpp (+14-3) - (modified) llvm/lib/IR/RuntimeLibcalls.cpp (+2-4) - (removed) llvm/test/CodeGen/AMDGPU/atomic-oversize.ll (-10) - (added) llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll (+55) - (added) llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll (+27) - (modified) llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll (-28) - (removed) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-fp128.ll (-122) - (removed) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i128.ll (-201) - (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll (-25) - (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll (+4-748) - (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll (+3-19) - (modified) llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp (+1-18) ``diff diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 044f0732779f3..44295b44482e7 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -65,6 +65,17 @@ class AtomicExpandImpl { const DataLayout *DL = nullptr; private: + void handleFailure(Instruction &FailedInst, const Twine &Msg) const { +LLVMContext &Ctx = FailedInst.getContext(); + +// TODO: Do not use generic error type +Ctx.emitError(&FailedInst, Msg); + +if (!FailedInst.getType()->isVoidTy()) + FailedInst.replaceAllUsesWith(PoisonValue::get(FailedInst.getType())); +FailedInst.eraseFromParent(); + } + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); @@ -1744,7 +1755,7 @@ void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst *I) { I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls); if (!expanded) -report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Load"); +handleFailure(*I, "unsupported atomic load"); } void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) { @@ -1757,7 +1768,7 @@ void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) { I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(), nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls); if (!expanded) -report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Store"); +handleFailure(*I, "unsupported atomic store"); } void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) { @@ -1772,7 +1783,7 @@ void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) { I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(), Libcalls); if (!expanded) -report_fatal_error("expandAtomicOpToLibcall shouldn't fail for CAS"); +handleFailure(*I, 
"unsupported cmpxchg"); } static ArrayRef GetRMWLibcall(AtomicRMWInst::BinOp Op) { diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 712f1a48d0b7b..b21504037be8f 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -455,10 +455,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, // Disable most libcalls on AMDGPU and NVPTX. if (TT.isAMDGPU() || TT.isNVPTX()) { -for (RTLIB::Libcall LC : RTLIB::libcalls()) { - if (!isAtomicLibCall(LC)) -setLibcallImpl(LC, RTLIB::Unsupported); -} +for (RTLIB::Libcall LC : RTLIB::libcalls()) + setLibcallImpl(LC, RTLIB::Unsupported); } if (TT.isOSMSVCRT()) { diff --git a/llvm/test/CodeGen/AMDGPU/atomic-oversize.ll b/llvm/test/CodeGen/AMDGPU/atomic-oversize.ll deleted file mode 100644 index f62a93f523365..0 --- a/llvm/test/CodeGen/AMDGPU/atomic-oversize.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s - -define void @test(ptr %a) nounwind { -; CHECK-LABEL: test: -; CHECK: __atomic_load_16 -; CHECK: __atomic_store_16 - %1 = load atomic i128, ptr %a seq_cst, align 16 - store atomic i128 %1, ptr %a seq_cst, align
[llvm-branch-commits] [llvm] AtomicExpand: Stop using report_fatal_error (PR #147300)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/147300). Learn more: https://graphite.dev/docs/merge-pull-requests * **#147300** 👈 (this PR — view in Graphite) * **#147299** * `main` This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/ https://github.com/llvm/llvm-project/pull/147300 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Bundle partial reductions inside VPExpressionRecipe (PR #147302)
llvmbot wrote: @llvm/pr-subscribers-backend-aarch64 Author: Sam Tebbs (SamTebbs33) Changes This PR bundles partial reductions inside the VPExpressionRecipe class. Depends on https://github.com/llvm/llvm-project/pull/147255 . --- Patch is 202.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147302.diff 16 Files Affected: - (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+2) - (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+15-4) - (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+1-1) - (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+6-2) - (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+23) - (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+7-4) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll (+55-35) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll (+2-2) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll (+98-98) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll (+42-42) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll (+299-279) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll (+14-22) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll (+12-12) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll (+11-20) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll (+4-7) - (modified) llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll (+26-26) ``diff diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 3cc0ea01953c3..338599a9bb5aa 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -223,6 +223,8 @@ class TargetTransformInfo { /// Get the kind of extension that an instruction represents. LLVM_ABI static PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I); + LLVM_ABI static PartialReductionExtendKind + getPartialReductionExtendKind(Instruction::CastOps CastOpc); /// Construct a TTI object using a type implementing the \c Concept /// API below. 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index ba0d070bffe6d..5e9733a264e22 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1001,13 +1001,24 @@ InstructionCost TargetTransformInfo::getShuffleCost( TargetTransformInfo::PartialReductionExtendKind TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { - if (isa<SExtInst>(I)) -return PR_SignExtend; - if (isa<ZExtInst>(I)) -return PR_ZeroExtend; + if (auto *Cast = dyn_cast<CastInst>(I)) +return getPartialReductionExtendKind(Cast->getOpcode()); return PR_None; } +TargetTransformInfo::PartialReductionExtendKind +TargetTransformInfo::getPartialReductionExtendKind( +Instruction::CastOps CastOpc) { + switch (CastOpc) { + case Instruction::CastOps::ZExt: +return PR_ZeroExtend; + case Instruction::CastOps::SExt: +return PR_SignExtend; + default: +return PR_None; + } +} + TTI::CastContextHint TargetTransformInfo::getCastContextHint(const Instruction *I) { if (!I) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d9a367535baf4..5021a490839b2 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5294,7 +5294,7 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost( EVT ResVT = TLI->getValueType(DL, ResTy); if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() && - VecVT.getSizeInBits() >= 64) { + VecVT.isFixedLengthVector() && VecVT.getSizeInBits() >= 64) { std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy); // The legal cases are: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1bc926db301d8..30f3566332d79 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2470,7 +2470,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || + R->getVPDefID() == VPRecipeBase::VPPartialReductionSC; } static inline bool classof(const VPUser *U) { @@ -2532,7 +2533,10 @@ class VPPartialReductionRecipe : public VPReductionRecipe { Opcode(Opcode), VFScaleFactor(ScaleFactor) { [
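The `getPartialReductionExtendKind` refactor above boils down to routing both overloads through one opcode switch, so the VPlan bundling code can classify an extend without materializing an instruction. A self-contained sketch of that shape (simplified local enum; the real code returns `TTI::PartialReductionExtendKind`):

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

enum ExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };

// Opcode-based overload: usable when only a cast opcode is known, as when
// a recipe records the extension kind rather than the original IR.
static ExtendKind getExtendKind(llvm::Instruction::CastOps CastOpc) {
  switch (CastOpc) {
  case llvm::Instruction::ZExt:
    return PR_ZeroExtend;
  case llvm::Instruction::SExt:
    return PR_SignExtend;
  default:
    return PR_None;
  }
}

// Instruction-based overload now simply defers to the opcode version.
static ExtendKind getExtendKind(const llvm::Instruction *I) {
  if (auto *Cast = llvm::dyn_cast<llvm::CastInst>(I))
    return getExtendKind(Cast->getOpcode());
  return PR_None;
}
```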
[llvm-branch-commits] [llvm] AtomicExpand: Stop using report_fatal_error (PR #147300)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/147300 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] MSP430: Move libcall CC setting to RuntimeLibcallsInfo (PR #146081)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/146081 >From ca7e199c05935ba53568fe96520acce04b5727c1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 23 Jun 2025 16:35:14 +0900 Subject: [PATCH] MSP430: Move libcall CC setting to RuntimeLibcallsInfo As a temporary step configure the calling convention here. This can't be moved into tablegen until RuntimeLibcallsInfo is split into a separate lowering component. --- llvm/lib/IR/RuntimeLibcalls.cpp | 5 + llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 1376ffc7c7293..8c3257147213d 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -364,6 +364,11 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, RTLIB::HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES, RTLIB::__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes); } + + if (TT.getArch() == Triple::ArchType::msp430) { +setLibcallImplCallingConv(RTLIB::__mspabi_mpyll, + CallingConv::MSP430_BUILTIN); + } } bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index 20d1781946f0f..d23504c203dd3 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -208,8 +208,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, for (const auto &LC : LibraryCalls) { setLibcallImpl(LC.Op, LC.Impl); } -setLibcallImplCallingConv(RTLIB::__mspabi_mpyll, - CallingConv::MSP430_BUILTIN); } setMinFunctionAlignment(Align(2)); ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
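As a usage illustration (a hypothetical standalone snippet written for this digest, not from the patch, and compilable only against a tree containing this stack), the effect of the move is that constructing `RuntimeLibcallsInfo` for an MSP430 triple yields the builtin convention with no target-lowering involvement:

```cpp
#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/TargetParser/Triple.h"

llvm::CallingConv::ID mspabiMpyllCC() {
  llvm::Triple TT("msp430--");
  llvm::RTLIB::RuntimeLibcallsInfo Libcalls(TT);
  // Expected to be CallingConv::MSP430_BUILTIN after this patch.
  return Libcalls.getLibcallImplCallingConv(llvm::RTLIB::__mspabi_mpyll);
}
```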
[llvm-branch-commits] [llvm] Lanai: Use TableGen to set libcall calling conventions (PR #146080)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/146080 >From a492a7e695c2f543b6caba066f5d8beb4272b8cf Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 23 Jun 2025 16:17:26 +0900 Subject: [PATCH] Lanai: Use TableGen to set libcall calling conventions --- llvm/include/llvm/IR/RuntimeLibcalls.td | 12 llvm/lib/Target/Lanai/LanaiISelLowering.cpp | 4 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index f8667269ec8cb..c15ffa0653335 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -1389,6 +1389,18 @@ def __hexagon_fast2_sqrtdf2 : RuntimeLibcallImpl; def __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes : RuntimeLibcallImpl; +//===--===// +// Lanai Runtime Libcalls +//===--===// + +def isLanai : RuntimeLibcallPredicate<"TT.getArch() == Triple::lanai">; + +// Use fast calling convention for library functions. +def LanaiSystemLibrary +: SystemRuntimeLibrary { + let DefaultLibcallCallingConv = FASTCC; +} + //===--===// // Mips16 Runtime Libcalls //===--===// diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp index 6fb73c5d18966..d23c5f43ad4ff 100644 --- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -150,10 +150,6 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM, // statements. Re-evaluate this on new benchmarks. setMinimumJumpTableEntries(100); - // Use fast calling convention for library functions. - for (RTLIB::LibcallImpl LC : RTLIB::libcall_impls()) -setLibcallImplCallingConv(LC, CallingConv::Fast); - MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] ARM: Start moving runtime libcalls into tablegen (PR #146084)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/146084 >From 8eb2e09e5f533bbf706445437d7cf5590d775fab Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 23 Jun 2025 23:23:01 +0900 Subject: [PATCH] ARM: Start moving runtime libcalls into tablegen We still need to manually set the calling conventions of some libcalls until the lowering is separated out. --- llvm/include/llvm/IR/RuntimeLibcalls.h| 2 +- llvm/include/llvm/IR/RuntimeLibcalls.td | 48 llvm/lib/IR/RuntimeLibcalls.cpp | 73 +-- .../RuntimeLibcallEmitter-calling-conv.td | 2 +- llvm/test/TableGen/RuntimeLibcallEmitter.td | 2 +- .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 2 +- 6 files changed, 53 insertions(+), 76 deletions(-) diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index ac83df3a4189e..4ea5ff9c9ade8 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -170,7 +170,7 @@ struct RuntimeLibcallsInfo { void initDefaultLibCallImpls(); /// Generated by tablegen. - void setTargetRuntimeLibcallSets(const Triple &TT); + void setTargetRuntimeLibcallSets(const Triple &TT, FloatABI::ABIType FloatABI); /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index c15ffa0653335..0237c8b41ae8c 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -17,6 +17,7 @@ class DuplicateLibcallImplWithPrefix /// Libcall Predicates def isOSDarwin : RuntimeLibcallPredicate<"TT.isOSDarwin()">; +def isOSWindows : RuntimeLibcallPredicate<"TT.isOSWindows()">; def darwinHasSinCosStret : RuntimeLibcallPredicate<"darwinHasSinCosStret(TT)">; def darwinHasExp10 : RuntimeLibcallPredicate<"darwinHasExp10(TT)">; @@ -1272,6 +1273,7 @@ def __aeabi_memclr4 : RuntimeLibcallImpl; def __aeabi_memclr8 : RuntimeLibcallImpl; // isTargetWindows() +defset list WindowsFPIntCastLibcalls = { def __stoi64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP def __dtoi64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP def __stou64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP @@ -1280,6 +1282,7 @@ def __i64tos : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_V def __i64tod : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP def __u64tos : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP def __u64tod : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP +} def __rt_sdiv : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS def __rt_sdiv64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS @@ -1306,6 +1309,51 @@ def __aeabi_h2f : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS def __gnu_f2h_ieee : RuntimeLibcallImpl; def __gnu_h2f_ieee : RuntimeLibcallImpl; + +def WindowARMDivRemCalls : LibcallImpls< + (add __rt_sdiv, __rt_sdiv64, __rt_udiv, __rt_udiv64), + isOSWindows> { + let CallingConv = ARM_AAPCS; +} + +def WindowARMFPIntCasts : LibcallImpls< + (add WindowsFPIntCastLibcalls), + isOSWindows> { + let CallingConv = ARM_AAPCS_VFP; +} + + +// Register based DivRem for AEABI (RTABI 4.2) +def AEABIDivRemCalls : LibcallImpls< + (add __aeabi_idivmod, __aeabi_ldivmod, + __aeabi_uidivmod, __aeabi_uldivmod), + RuntimeLibcallPredicate<[{TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() || +TT.isTargetMuslAEABI()}]>> { + let CallingConv = ARM_AAPCS; +} + +def isARMOrThumb : RuntimeLibcallPredicate<"TT.isARM() || TT.isThumb()">; + +def ARMSystemLibrary +: 
SystemRuntimeLibrary>)> { + let DefaultLibcallCallingConv = LibcallCallingConv<[{ + (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit()) ? +(FloatABI == FloatABI::Hard ? CallingConv::ARM_AAPCS_VFP +: CallingConv::ARM_AAPCS) : + CallingConv::C + }]>; +} + //===--===// // AVR Runtime Libcalls //===--===// diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index c2d0b0684ec39..2d168befd145c 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -25,77 +25,6 @@ static cl::opt static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT, FloatABI::ABIType FloatABIType, EABI EABIVersion) { - if (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit()) { -CallingConv::ID DefaultCC = FloatABIType == FloatABI::Hard -? CallingConv::ARM_AAPCS_VFP -: CallingConv::ARM_AAPCS; -for (RTLIB::LibcallImpl LC : RTLIB::libcall_i
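The conditional `DefaultLibcallCallingConv` above encodes in TableGen the same rule the removed C++ implemented. Restated as plain C++ for readability (a sketch of the predicate only, under the assumption that the Triple and FloatABI inputs are the ones the generated code receives):

```cpp
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/TargetParser/Triple.h"

static llvm::CallingConv::ID
defaultARMLibcallCC(const llvm::Triple &TT, llvm::FloatABI::ABIType FloatABI) {
  // Darwin-family targets keep the default C convention; everything else
  // picks AAPCS or AAPCS-VFP depending on the float ABI in use.
  if (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit())
    return FloatABI == llvm::FloatABI::Hard ? llvm::CallingConv::ARM_AAPCS_VFP
                                            : llvm::CallingConv::ARM_AAPCS;
  return llvm::CallingConv::C;
}
```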
[llvm-branch-commits] [llvm] TableGen: Handle setting runtime libcall calling conventions (PR #144980)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/144980 >From 414f451ba5714d7aef14c2fdd7f95fc7f7d8be19 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 13 Jun 2025 15:54:41 +0900 Subject: [PATCH] TableGen: Handle setting runtime libcall calling conventions Allow associating a non-default CallingConv with a set of library functions, and applying a default for a SystemLibrary. I also wanted to be able to apply a default calling conv to a RuntimeLibcallImpl, but that turned out to be annoying so leave it for later. --- llvm/include/llvm/IR/RuntimeLibcalls.td | 140 +-- llvm/include/llvm/IR/RuntimeLibcallsImpl.td | 27 ++- llvm/lib/IR/RuntimeLibcalls.cpp | 141 --- .../RuntimeLibcallEmitter-calling-conv.td | 128 ++ llvm/test/TableGen/RuntimeLibcallEmitter.td | 19 +- .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 163 ++ 6 files changed, 429 insertions(+), 189 deletions(-) create mode 100644 llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index e6fffa2c7f933..f8667269ec8cb 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -1311,11 +1311,12 @@ def __gnu_h2f_ieee : RuntimeLibcallImpl; //===--===// // Several of the runtime library functions use a special calling conv -def __divmodqi4 : RuntimeLibcallImpl; // CallingConv::AVR_BUILTIN -def __divmodhi4 : RuntimeLibcallImpl; // CallingConv::AVR_BUILTIN +def __divmodqi4 : RuntimeLibcallImpl; +def __divmodhi4 : RuntimeLibcallImpl; +def __udivmodqi4 : RuntimeLibcallImpl; +def __udivmodhi4 : RuntimeLibcallImpl; + //def __divmodsi4 : RuntimeLibcallImpl; -def __udivmodqi4 : RuntimeLibcallImpl; // CallingConv::AVR_BUILTIN -def __udivmodhi4 : RuntimeLibcallImpl; // CallingConv::AVR_BUILTIN //def __udivmodsi4 : RuntimeLibcallImpl; // Standard sinf/cosf name replaced with "sin" and "cos". 
Define a @@ -1341,9 +1342,12 @@ def AVRSystemLibrary // Standard f64 names are replaced sin, cos, sinf, cosf), - __divmodqi4, __divmodhi4, __divmodsi4, __udivmodqi4, __udivmodhi4, - __udivmodsi4, - + // Several of the runtime library functions use a special calling + // conv + LibcallsWithCC<(add __divmodqi4, __divmodhi4, __udivmodqi4, + __udivmodhi4), + AVR_BUILTIN>, + __divmodsi4, __udivmodsi4, // Trigonometric rtlib functions avr_sin, avr_cos)>; @@ -1566,6 +1570,117 @@ def __mspabi_mpyll : RuntimeLibcallImpl; // setLibcallCallingConv(MUL_I64, CallingConv::MSP430_BUILTIN); +def isMSP430 : RuntimeLibcallPredicate<"TT.getArch() == Triple::msp430">; + +defvar MSP430DefaultOptOut = [ + __addsf3, __divsf3, __extendsfdf2, __truncdfsf2, __fixsfsi, + __fixsfdi, __fixunssfsi, __mulsf3, __eqsf2, __gesf2, __gtsf2, + __divhi3, __divsi3, __ashlsi3, __floatsidf, __floatsisf, + __ashrsi3, __modhi3, __udivsi3, __fixdfsi, __fixunssfdi, + __udivhi3, __umodsi3, __nesf2, __lesf2, __floatundisf, + __fixdfdi, __fixunsdfsi, __modsi3, __floatunsisf, + __fixunsdfdi, __ltsf2, __floatdisf, __floatdidf, + __lshrsi3, __subsf3, __umodhi3, __floatunsidf, + __floatundidf +]; + +// EABI Libcalls - EABI Section 6.2 +def MSP430SystemLibrary +: SystemRuntimeLibrary, + __mspabi_cmpf__oeq, + __mspabi_cmpf__une, + __mspabi_cmpf__oge, + __mspabi_cmpf__olt, + __mspabi_cmpf__ole, + __mspabi_cmpf__ogt, + + // Floating point arithmetic - EABI Table 8 + LibcallsWithCC<(add __mspabi_addd, + __mspabi_subd, + __mspabi_mpyd, + __mspabi_divd), MSP430_BUILTIN>, + + __mspabi_addf, + __mspabi_subf, + __mspabi_mpyf, + __mspabi_divf, + + // The following are NOT implemented in libgcc + // __mspabi_negd, + // __mspabi_negf, + + // Universal Integer Operations - EABI Table 9 + __mspabi_divi, + __mspabi_divli, + LibcallsWithCC<(add __mspabi_divlli), MSP430_BUILTIN>, + __mspabi_divu, + __mspabi_divul, + LibcallsWithCC<(add __mspabi_divull), MSP430_BUILTIN>, + __mspabi_remi, + __mspabi_remli, + LibcallsWithCC<(add __mspabi_remlli), MSP430_BUILTIN>, + __mspabi_remu, + __mspabi_remul, + LibcallsWithCC<(add __mspabi_remull), MSP430_BUILTIN>, + + // Bitwise Operations - EABI Table 10 + // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc + __mspabi_srll, + __mspabi_sral, + __mspabi_slll + // __mspabi_[srlll/srall/s/rlli/rlll] are NOT implemented in libgcc + ) +>; + //===--===// // NVPTX Runtime Libcalls //===---
[llvm-branch-commits] [llvm] RuntimeLibcalls: Remove table of soft float compare cond codes (PR #146082)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/146082 >From effe1ad6d053a4dffccc3d68574868565ce94397 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 23 Jun 2025 19:10:30 +0900 Subject: [PATCH] RuntimeLibcalls: Remove table of soft float compare cond codes Previously we had a table of entries for every Libcall for the comparison to use against an integer 0 if it was a soft float compare function. This was only relevant to a handful of opcodes, so it was wasteful. Now that we can distinguish the abstract libcall for the compare with the concrete implementation, we can just directly hardcode the comparison against the libcall impl without this configuration system. --- .../include/llvm/CodeGen/RuntimeLibcallUtil.h | 3 - llvm/include/llvm/CodeGen/TargetLowering.h| 17 +- llvm/include/llvm/IR/RuntimeLibcalls.h| 32 +--- .../CodeGen/SelectionDAG/TargetLowering.cpp | 16 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 107 +++ llvm/lib/IR/RuntimeLibcalls.cpp | 31 --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 176 +- 7 files changed, 182 insertions(+), 200 deletions(-) diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h index 7481ed5b80b3f..09a8151e9ec9c 100644 --- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h +++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h @@ -105,9 +105,6 @@ LLVM_ABI Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); /// UNKNOW_LIBCALL if there is none. LLVM_ABI Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); -/// Initialize the default condition code on the libcalls. -LLVM_ABI void initCmpLibcallCCs(ISD::CondCode *CmpLibcallCCs); - } // namespace RTLIB } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index fee94cc167363..fa46d296bf533 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3571,19 +3571,10 @@ class LLVM_ABI TargetLoweringBase { const char *getMemcpyName() const { return Libcalls.getMemcpyName(); } - /// Override the default CondCode to be used to test the result of the - /// comparison libcall against zero. - /// FIXME: This should be removed - void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) { -Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred); - } - - /// Get the CondCode that's to be used to test the result of the comparison - /// libcall against zero. - CmpInst::Predicate - getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { -return Libcalls.getSoftFloatCmpLibcallPredicate(Call); - } + /// Get the comparison predicate that's to be used to test the result of the + /// comparison libcall against zero. This should only be used with + /// floating-point compare libcalls. + ISD::CondCode getSoftFloatCmpLibcallPredicate(RTLIB::LibcallImpl Call) const; /// Set the CallingConv that should be used for the specified libcall. 
void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 66d11c4cbabb7..ac83df3a4189e 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -59,7 +59,6 @@ struct RuntimeLibcallsInfo { ExceptionHandling ExceptionModel = ExceptionHandling::None, FloatABI::ABIType FloatABI = FloatABI::Default, EABI EABIVersion = EABI::Default, StringRef ABIName = "") { -initSoftFloatCmpLibcallPredicates(); initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName); } @@ -106,22 +105,6 @@ struct RuntimeLibcallsInfo { return ArrayRef(LibcallImpls).drop_front(); } - /// Get the comparison predicate that's to be used to test the result of the - /// comparison libcall against zero. This should only be used with - /// floating-point compare libcalls. - // FIXME: This should be a function of RTLIB::LibcallImpl - CmpInst::Predicate - getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { -return SoftFloatCompareLibcallPredicates[Call]; - } - - // FIXME: This should be removed. This should be private constant. - // FIXME: This should be a function of RTLIB::LibcallImpl - void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call, - CmpInst::Predicate Pred) { -SoftFloatCompareLibcallPredicates[Call] = Pred; - } - /// Return a function name compatible with RTLIB::MEMCPY, or nullptr if fully /// unsupported. const char *getMemcpyName() const { @@ -132,6 +115,11 @@ struct RuntimeLibcallsInfo { return getLibcallName(RTLIB::MEMMOVE); } + /// Return the libcall provided by \p Impl + static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) { +return ImplToLibcall[Impl]; + } + priva
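The shape of the replacement is a pure function from the concrete compare routine to a condition code, rather than a mutable per-Libcall table. A heavily abbreviated sketch under stated assumptions (the enum and its members are invented stand-ins for the generated `RTLIB::LibcallImpl` enumerators; the patch's actual mapping lives in TargetLoweringBase.cpp and covers every soft-float compare implementation):

```cpp
#include "llvm/CodeGen/ISDOpcodes.h"

// Toy stand-ins for the generated LibcallImpl enumerators.
enum class CmpImpl { eqdf2, ltdf2, Other };

// Map a concrete compare routine to the condition code used to test its
// integer result against zero.
static llvm::ISD::CondCode softFloatCmpPredicate(CmpImpl Impl) {
  switch (Impl) {
  case CmpImpl::eqdf2: // __eqdf2 returns 0 iff operands are ordered and equal
    return llvm::ISD::SETEQ;
  case CmpImpl::ltdf2: // __ltdf2 returns a negative value iff a < b
    return llvm::ISD::SETLT;
  default:
    return llvm::ISD::SETNE; // illustrative fallback only
  }
}
```

Because these relationships are fixed properties of the libgcc/compiler-rt routines, a hardcoded function loses nothing relative to the old configurable table while shrinking per-target state.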
[llvm-branch-commits] [llvm] RuntimeLibcalls: Associate calling convention with libcall impls (PR #144979)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/144979 >From 5a9b9d9720f01a7575f33feb73042eafcfa3f82c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 17 Jun 2025 16:25:50 +0900 Subject: [PATCH] RuntimeLibcalls: Associate calling convention with libcall impls Instead of associating the libcall with the RTLIB::Libcall, put it into a table indexed by the RTLIB::LibcallImpl. The LibcallImpls should contain all ABI details for a particular implementation, not the abstract Libcall. In the future the wrappers in terms of the RTLIB::Libcall should be removed. --- llvm/include/llvm/CodeGen/TargetLowering.h| 16 - llvm/include/llvm/IR/RuntimeLibcalls.h| 32 ++--- llvm/lib/IR/RuntimeLibcalls.cpp | 70 +++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 18 ++--- llvm/lib/Target/Lanai/LanaiISelLowering.cpp | 4 +- llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 3 +- 6 files changed, 92 insertions(+), 51 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 420f1d5fb20ca..fee94cc167363 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3559,6 +3559,11 @@ class LLVM_ABI TargetLoweringBase { Libcalls.setLibcallImpl(Call, Impl); } + /// Get the libcall impl routine name for the specified libcall. + RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const { +return Libcalls.getLibcallImpl(Call); + } + /// Get the libcall routine name for the specified libcall. const char *getLibcallName(RTLIB::Libcall Call) const { return Libcalls.getLibcallName(Call); @@ -3581,11 +3586,18 @@ class LLVM_ABI TargetLoweringBase { } /// Set the CallingConv that should be used for the specified libcall. - void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { -Libcalls.setLibcallCallingConv(Call, CC); + void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { +Libcalls.setLibcallImplCallingConv(Call, CC); + } + + /// Get the CallingConv that should be used for the specified libcall + /// implementation. + CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const { +return Libcalls.getLibcallImplCallingConv(Call); } /// Get the CallingConv that should be used for the specified libcall. + // FIXME: Remove this wrapper and directly use the used LibcallImpl CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const { return Libcalls.getLibcallCallingConv(Call); } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index c8d97bcd2e664..66d11c4cbabb7 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -37,6 +37,10 @@ template <> struct enum_iteration_traits<RTLIB::Libcall> { static constexpr bool is_iterable = true; }; +template <> struct enum_iteration_traits<RTLIB::LibcallImpl> { + static constexpr bool is_iterable = true; +}; + namespace RTLIB { // Return an iterator over all Libcall values. @@ -44,6 +48,10 @@ static inline auto libcalls() { return enum_seq(static_cast<RTLIB::Libcall>(0), RTLIB::UNKNOWN_LIBCALL); } +static inline auto libcall_impls() { + return enum_seq(static_cast<RTLIB::LibcallImpl>(1), RTLIB::NumLibcallImpls); +} + /// A simple container for information about the supported runtime calls. struct RuntimeLibcallsInfo { explicit RuntimeLibcallsInfo( @@ -76,16 +84,21 @@ struct RuntimeLibcallsInfo { return LibcallImpls[Call]; } - /// Set the CallingConv that should be used for the specified libcall.
- // FIXME: This should be a function of RTLIB::LibcallImpl - void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { -LibcallCallingConvs[Call] = CC; + /// Set the CallingConv that should be used for the specified libcall + /// implementation + void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { +LibcallImplCallingConvs[Call] = CC; } - /// Get the CallingConv that should be used for the specified libcall. - // FIXME: This should be a function of RTLIB::LibcallImpl + // FIXME: Remove this wrapper in favor of directly using + // getLibcallImplCallingConv CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const { -return LibcallCallingConvs[Call]; +return LibcallImplCallingConvs[LibcallImpls[Call]]; + } + + /// Get the CallingConv that should be used for the specified libcall. + CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const { +return LibcallImplCallingConvs[Call]; } ArrayRef<RTLIB::LibcallImpl> getLibcallImpls() const { @@ -130,8 +143,9 @@ struct RuntimeLibcallsInfo { static_assert(static_cast(CallingConv::C) == 0, "default calling conv should be encoded as 0"); - /// Stores the CallingConv that should be used for each libcall. - CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL] = {}; + /// Stores th
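The data-layout change in this patch is easiest to see in miniature. A toy model (simplified types and enumerators, purely illustrative and not LLVM code) of moving the calling-convention table from Libcall-indexed to LibcallImpl-indexed, with the old query kept as a wrapper:

```cpp
#include <array>

enum Libcall { ADD_F64, NUM_LIBCALLS };                  // abstract operation
enum LibcallImpl { Unsupported, aeabi_dadd, NUM_IMPLS }; // concrete routine
using CallingConvID = unsigned;

struct LibcallTable {
  std::array<LibcallImpl, NUM_LIBCALLS> Impls{};  // Libcall -> chosen impl
  std::array<CallingConvID, NUM_IMPLS> ImplCCs{}; // impl -> calling conv

  void setImplCC(LibcallImpl Impl, CallingConvID CC) { ImplCCs[Impl] = CC; }

  // Legacy query: resolve to the concrete impl first, then look up its ABI.
  CallingConvID getLibcallCallingConv(Libcall LC) const {
    return ImplCCs[Impls[LC]];
  }
};
```

Keying ABI details off the implementation means two implementations of the same abstract libcall (say, an __aeabi routine and a generic libgcc one) can carry different conventions without any per-target fixup after the implementation is selected.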
[llvm-branch-commits] [llvm] ARM: Unconditionally set eabi libcall calling convs in RuntimeLibcalls (PR #146083)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/146083 >From c61003aede9ddd5db0503428f4dc500718e85028 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 23 Jun 2025 20:14:11 +0900 Subject: [PATCH 1/3] ARM: Unconditionally set eabi libcall calling convs in RuntimeLibcalls This fully consolidates all the calling convention configuration into RuntimeLibcallInfo. I'm assuming that __aeabi functions have a universal calling convention, and on other ABIs just don't use them. This will enable splitting of RuntimeLibcallInfo into the ABI and lowering component. --- llvm/lib/IR/RuntimeLibcalls.cpp | 39 ++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 159 +++- 2 files changed, 110 insertions(+), 88 deletions(-) diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index dec766698dc1d..e62743860d53d 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -98,6 +98,45 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT, Info.setLibcallImpl(RTLIB::SDIVREM_I32, RTLIB::__divmodsi4); Info.setLibcallImpl(RTLIB::UDIVREM_I32, RTLIB::__udivmodsi4); } + + static const RTLIB::LibcallImpl AAPCS_Libcalls[] = { + RTLIB::__aeabi_dadd, RTLIB::__aeabi_ddiv, + RTLIB::__aeabi_dmul, RTLIB::__aeabi_dsub, + RTLIB::__aeabi_dcmpeq__ne, RTLIB::__aeabi_dcmpeq__eq, + RTLIB::__aeabi_dcmplt, RTLIB::__aeabi_dcmple, + RTLIB::__aeabi_dcmpge, RTLIB::__aeabi_dcmpgt, + RTLIB::__aeabi_dcmpun, RTLIB::__aeabi_fadd, + RTLIB::__aeabi_fdiv, RTLIB::__aeabi_fmul, + RTLIB::__aeabi_fsub, RTLIB::__aeabi_fcmpeq__ne, + RTLIB::__aeabi_fcmpeq__eq, RTLIB::__aeabi_fcmplt, + RTLIB::__aeabi_fcmple, RTLIB::__aeabi_fcmpge, + RTLIB::__aeabi_fcmpgt, RTLIB::__aeabi_fcmpun, + RTLIB::__aeabi_d2iz, RTLIB::__aeabi_d2uiz, + RTLIB::__aeabi_d2lz, RTLIB::__aeabi_d2ulz, + RTLIB::__aeabi_f2iz, RTLIB::__aeabi_f2uiz, + RTLIB::__aeabi_f2lz, RTLIB::__aeabi_f2ulz, + RTLIB::__aeabi_d2f,RTLIB::__aeabi_d2h, + RTLIB::__aeabi_f2d,RTLIB::__aeabi_i2d, + RTLIB::__aeabi_ui2d, RTLIB::__aeabi_l2d, + RTLIB::__aeabi_ul2d, RTLIB::__aeabi_i2f, + RTLIB::__aeabi_ui2f, RTLIB::__aeabi_l2f, + RTLIB::__aeabi_ul2f, RTLIB::__aeabi_lmul, + RTLIB::__aeabi_llsl, RTLIB::__aeabi_llsr, + RTLIB::__aeabi_lasr, RTLIB::__aeabi_idiv__i8, + RTLIB::__aeabi_idiv__i16, RTLIB::__aeabi_idiv__i32, + RTLIB::__aeabi_ldivmod,RTLIB::__aeabi_uidiv__i8, + RTLIB::__aeabi_uidiv__i16, RTLIB::__aeabi_uidiv__i32, + RTLIB::__aeabi_uldivmod, RTLIB::__aeabi_f2h, + RTLIB::__aeabi_d2h,RTLIB::__aeabi_h2f, + RTLIB::__aeabi_memcpy, RTLIB::__aeabi_memmove, + RTLIB::__aeabi_memset, RTLIB::__aeabi_memcpy4, + RTLIB::__aeabi_memcpy8,RTLIB::__aeabi_memmove4, + RTLIB::__aeabi_memmove8, RTLIB::__aeabi_memset4, + RTLIB::__aeabi_memset8,RTLIB::__aeabi_memclr, + RTLIB::__aeabi_memclr4,RTLIB::__aeabi_memclr8}; + + for (RTLIB::LibcallImpl Impl : AAPCS_Libcalls) +Info.setLibcallImplCallingConv(Impl, CallingConv::ARM_AAPCS); } static void setLongDoubleIsF128Libm(RuntimeLibcallsInfo &Info, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 44dcbc9f26616..8c68c6d123514 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -578,9 +578,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, }; // clang-format on - for (const auto &LC : LibraryCalls) { + for (const auto &LC : LibraryCalls) setLibcallImpl(LC.Op, LC.Impl); - } } } @@ -594,94 +593,91 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, static const struct { 
const RTLIB::Libcall Op; const RTLIB::LibcallImpl Impl; - const CallingConv::ID CC; } LibraryCalls[] = { // Double-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 2 - { RTLIB::ADD_F64, RTLIB::__aeabi_dadd, CallingConv::ARM_AAPCS }, - { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv, CallingConv::ARM_AAPCS }, - { RTLIB::MUL_F64, RTLIB::__aeabi_dmul, CallingConv::ARM_AAPCS }, - { RTLIB::SUB_F64, RTLIB::__aeabi_dsub, CallingConv::ARM_AAPCS }, + { RTLIB::ADD_F64, RTLIB::__aeabi_dadd }, + { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv }, + { RTLIB::MUL_F64, RTLIB::__aeabi_dmul }, + { RTLIB::SUB_F64, RTLIB::__aeabi_dsub }, // Double-precision floating-point comparison helper functions // RTABI chapter 4.1.2, Table 3 - { RTLIB::OEQ_F64, RTLIB::__aeabi_dcmpeq__ne, CallingConv::ARM_AAPCS }, - { RTLIB::UNE_F64, RTLIB::__aeabi_dcmpeq__eq, CallingConv::ARM_AAPCS }, - {
[llvm-branch-commits] [llvm] RuntimeLibcalls: Associate calling convention with libcall impls (PR #144979)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/144979 >From 5a9b9d9720f01a7575f33feb73042eafcfa3f82c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 17 Jun 2025 16:25:50 +0900 Subject: [PATCH] RuntimeLibcalls: Associate calling convention with libcall impls Instead of associating the libcall with the RTLIB::Libcall, put it into a table indexed by the RTLIB::LibcallImpl. The LibcallImpls should contain all ABI details for a particular implementation, not the abstract Libcall. In the future the wrappers in terms of the RTLIB::Libcall should be removed. --- llvm/include/llvm/CodeGen/TargetLowering.h| 16 - llvm/include/llvm/IR/RuntimeLibcalls.h| 32 ++--- llvm/lib/IR/RuntimeLibcalls.cpp | 70 +++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 18 ++--- llvm/lib/Target/Lanai/LanaiISelLowering.cpp | 4 +- llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 3 +- 6 files changed, 92 insertions(+), 51 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 420f1d5fb20ca..fee94cc167363 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3559,6 +3559,11 @@ class LLVM_ABI TargetLoweringBase { Libcalls.setLibcallImpl(Call, Impl); } + /// Get the libcall impl routine name for the specified libcall. + RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const { +return Libcalls.getLibcallImpl(Call); + } + /// Get the libcall routine name for the specified libcall. const char *getLibcallName(RTLIB::Libcall Call) const { return Libcalls.getLibcallName(Call); @@ -3581,11 +3586,18 @@ class LLVM_ABI TargetLoweringBase { } /// Set the CallingConv that should be used for the specified libcall. - void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { -Libcalls.setLibcallCallingConv(Call, CC); + void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { +Libcalls.setLibcallImplCallingConv(Call, CC); + } + + /// Get the CallingConv that should be used for the specified libcall + /// implementation. + CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const { +return Libcalls.getLibcallImplCallingConv(Call); } /// Get the CallingConv that should be used for the specified libcall. + // FIXME: Remove this wrapper and directly use the used LibcallImpl CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const { return Libcalls.getLibcallCallingConv(Call); } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index c8d97bcd2e664..66d11c4cbabb7 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -37,6 +37,10 @@ template <> struct enum_iteration_traits { static constexpr bool is_iterable = true; }; +template <> struct enum_iteration_traits { + static constexpr bool is_iterable = true; +}; + namespace RTLIB { // Return an iterator over all Libcall values. @@ -44,6 +48,10 @@ static inline auto libcalls() { return enum_seq(static_cast(0), RTLIB::UNKNOWN_LIBCALL); } +static inline auto libcall_impls() { + return enum_seq(static_cast(1), RTLIB::NumLibcallImpls); +} + /// A simple container for information about the supported runtime calls. struct RuntimeLibcallsInfo { explicit RuntimeLibcallsInfo( @@ -76,16 +84,21 @@ struct RuntimeLibcallsInfo { return LibcallImpls[Call]; } - /// Set the CallingConv that should be used for the specified libcall. 
- // FIXME: This should be a function of RTLIB::LibcallImpl - void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { -LibcallCallingConvs[Call] = CC; + /// Set the CallingConv that should be used for the specified libcall + /// implementation + void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { +LibcallImplCallingConvs[Call] = CC; } - /// Get the CallingConv that should be used for the specified libcall. - // FIXME: This should be a function of RTLIB::LibcallImpl + // FIXME: Remove this wrapper in favor of directly using + // getLibcallImplCallingConv CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const { -return LibcallCallingConvs[Call]; +return LibcallImplCallingConvs[LibcallImpls[Call]]; + } + + /// Get the CallingConv that should be used for the specified libcall. + CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const { +return LibcallImplCallingConvs[Call]; } ArrayRef getLibcallImpls() const { @@ -130,8 +143,9 @@ struct RuntimeLibcallsInfo { static_assert(static_cast(CallingConv::C) == 0, "default calling conv should be encoded as 0"); - /// Stores the CallingConv that should be used for each libcall. - CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL] = {}; + /// Stores th
[llvm-branch-commits] [llvm] ARM: Start moving runtime libcalls into tablegen (PR #146084)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/146084 >From 8eb2e09e5f533bbf706445437d7cf5590d775fab Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 23 Jun 2025 23:23:01 +0900 Subject: [PATCH] ARM: Start moving runtime libcalls into tablegen We still need to manually set the calling conventions of some libcalls until the lowering is separated out. --- llvm/include/llvm/IR/RuntimeLibcalls.h| 2 +- llvm/include/llvm/IR/RuntimeLibcalls.td | 48 llvm/lib/IR/RuntimeLibcalls.cpp | 73 +-- .../RuntimeLibcallEmitter-calling-conv.td | 2 +- llvm/test/TableGen/RuntimeLibcallEmitter.td | 2 +- .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 2 +- 6 files changed, 53 insertions(+), 76 deletions(-) diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index ac83df3a4189e..4ea5ff9c9ade8 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -170,7 +170,7 @@ struct RuntimeLibcallsInfo { void initDefaultLibCallImpls(); /// Generated by tablegen. - void setTargetRuntimeLibcallSets(const Triple &TT); + void setTargetRuntimeLibcallSets(const Triple &TT, FloatABI::ABIType FloatABI); /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index c15ffa0653335..0237c8b41ae8c 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -17,6 +17,7 @@ class DuplicateLibcallImplWithPrefix /// Libcall Predicates def isOSDarwin : RuntimeLibcallPredicate<"TT.isOSDarwin()">; +def isOSWindows : RuntimeLibcallPredicate<"TT.isOSWindows()">; def darwinHasSinCosStret : RuntimeLibcallPredicate<"darwinHasSinCosStret(TT)">; def darwinHasExp10 : RuntimeLibcallPredicate<"darwinHasExp10(TT)">; @@ -1272,6 +1273,7 @@ def __aeabi_memclr4 : RuntimeLibcallImpl; def __aeabi_memclr8 : RuntimeLibcallImpl; // isTargetWindows() +defset list WindowsFPIntCastLibcalls = { def __stoi64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP def __dtoi64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP def __stou64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP @@ -1280,6 +1282,7 @@ def __i64tos : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_V def __i64tod : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP def __u64tos : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP def __u64tod : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS_VFP +} def __rt_sdiv : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS def __rt_sdiv64 : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS @@ -1306,6 +1309,51 @@ def __aeabi_h2f : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS def __gnu_f2h_ieee : RuntimeLibcallImpl; def __gnu_h2f_ieee : RuntimeLibcallImpl; + +def WindowARMDivRemCalls : LibcallImpls< + (add __rt_sdiv, __rt_sdiv64, __rt_udiv, __rt_udiv64), + isOSWindows> { + let CallingConv = ARM_AAPCS; +} + +def WindowARMFPIntCasts : LibcallImpls< + (add WindowsFPIntCastLibcalls), + isOSWindows> { + let CallingConv = ARM_AAPCS_VFP; +} + + +// Register based DivRem for AEABI (RTABI 4.2) +def AEABIDivRemCalls : LibcallImpls< + (add __aeabi_idivmod, __aeabi_ldivmod, + __aeabi_uidivmod, __aeabi_uldivmod), + RuntimeLibcallPredicate<[{TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() || +TT.isTargetMuslAEABI()}]>> { + let CallingConv = ARM_AAPCS; +} + +def isARMOrThumb : RuntimeLibcallPredicate<"TT.isARM() || TT.isThumb()">; + +def ARMSystemLibrary +: 
SystemRuntimeLibrary>)> { + let DefaultLibcallCallingConv = LibcallCallingConv<[{ + (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit()) ? +(FloatABI == FloatABI::Hard ? CallingConv::ARM_AAPCS_VFP +: CallingConv::ARM_AAPCS) : + CallingConv::C + }]>; +} + //===--===// // AVR Runtime Libcalls //===--===// diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index c2d0b0684ec39..2d168befd145c 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -25,77 +25,6 @@ static cl::opt static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT, FloatABI::ABIType FloatABIType, EABI EABIVersion) { - if (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit()) { -CallingConv::ID DefaultCC = FloatABIType == FloatABI::Hard -? CallingConv::ARM_AAPCS_VFP -: CallingConv::ARM_AAPCS; -for (RTLIB::LibcallImpl LC : RTLIB::libcall_i
[llvm-branch-commits] [llvm] RuntimeLibcalls: Remove table of soft float compare cond codes (PR #146082)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/146082 >From effe1ad6d053a4dffccc3d68574868565ce94397 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 23 Jun 2025 19:10:30 +0900 Subject: [PATCH] RuntimeLibcalls: Remove table of soft float compare cond codes Previously we had a table of entries for every Libcall for the comparison to use against an integer 0 if it was a soft float compare function. This was only relevant to a handful of opcodes, so it was wasteful. Now that we can distinguish the abstract libcall for the compare with the concrete implementation, we can just directly hardcode the comparison against the libcall impl without this configuration system. --- .../include/llvm/CodeGen/RuntimeLibcallUtil.h | 3 - llvm/include/llvm/CodeGen/TargetLowering.h| 17 +- llvm/include/llvm/IR/RuntimeLibcalls.h| 32 +--- .../CodeGen/SelectionDAG/TargetLowering.cpp | 16 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 107 +++ llvm/lib/IR/RuntimeLibcalls.cpp | 31 --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 176 +- 7 files changed, 182 insertions(+), 200 deletions(-) diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h index 7481ed5b80b3f..09a8151e9ec9c 100644 --- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h +++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h @@ -105,9 +105,6 @@ LLVM_ABI Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); /// UNKNOW_LIBCALL if there is none. LLVM_ABI Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); -/// Initialize the default condition code on the libcalls. -LLVM_ABI void initCmpLibcallCCs(ISD::CondCode *CmpLibcallCCs); - } // namespace RTLIB } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index fee94cc167363..fa46d296bf533 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3571,19 +3571,10 @@ class LLVM_ABI TargetLoweringBase { const char *getMemcpyName() const { return Libcalls.getMemcpyName(); } - /// Override the default CondCode to be used to test the result of the - /// comparison libcall against zero. - /// FIXME: This should be removed - void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) { -Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred); - } - - /// Get the CondCode that's to be used to test the result of the comparison - /// libcall against zero. - CmpInst::Predicate - getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { -return Libcalls.getSoftFloatCmpLibcallPredicate(Call); - } + /// Get the comparison predicate that's to be used to test the result of the + /// comparison libcall against zero. This should only be used with + /// floating-point compare libcalls. + ISD::CondCode getSoftFloatCmpLibcallPredicate(RTLIB::LibcallImpl Call) const; /// Set the CallingConv that should be used for the specified libcall. 
void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 66d11c4cbabb7..ac83df3a4189e 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -59,7 +59,6 @@ struct RuntimeLibcallsInfo { ExceptionHandling ExceptionModel = ExceptionHandling::None, FloatABI::ABIType FloatABI = FloatABI::Default, EABI EABIVersion = EABI::Default, StringRef ABIName = "") { -initSoftFloatCmpLibcallPredicates(); initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName); } @@ -106,22 +105,6 @@ struct RuntimeLibcallsInfo { return ArrayRef(LibcallImpls).drop_front(); } - /// Get the comparison predicate that's to be used to test the result of the - /// comparison libcall against zero. This should only be used with - /// floating-point compare libcalls. - // FIXME: This should be a function of RTLIB::LibcallImpl - CmpInst::Predicate - getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { -return SoftFloatCompareLibcallPredicates[Call]; - } - - // FIXME: This should be removed. This should be private constant. - // FIXME: This should be a function of RTLIB::LibcallImpl - void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call, - CmpInst::Predicate Pred) { -SoftFloatCompareLibcallPredicates[Call] = Pred; - } - /// Return a function name compatible with RTLIB::MEMCPY, or nullptr if fully /// unsupported. const char *getMemcpyName() const { @@ -132,6 +115,11 @@ struct RuntimeLibcallsInfo { return getLibcallName(RTLIB::MEMMOVE); } + /// Return the libcall provided by \p Impl + static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) { +return ImplToLibcall[Impl]; + } + priva
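To make the described direction concrete, here is a minimal sketch of deriving the predicate directly from the concrete libcall implementation. The function signature and the `__aeabi` enumerators come from the quoted patch; the case selection and the fallback are assumptions for illustration, not the committed mapping.

```cpp
// Sketch only: map each soft-float compare LibcallImpl to the condition used
// to test its integer result against zero. The cases shown are illustrative.
ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate(
    RTLIB::LibcallImpl Call) const {
  switch (Call) {
  case RTLIB::__aeabi_dcmpeq__ne: // OEQ lowered as (__aeabi_dcmpeq(...) != 0)
  case RTLIB::__aeabi_fcmpeq__ne:
    return ISD::SETNE;
  case RTLIB::__aeabi_dcmpeq__eq: // UNE lowered as (__aeabi_dcmpeq(...) == 0)
  case RTLIB::__aeabi_fcmpeq__eq:
    return ISD::SETEQ;
  default:
    return ISD::SETNE; // assumed fallback for this sketch
  }
}
```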
[llvm-branch-commits] [llvm] [mlir] [mlir][GPU][transform] Add gpu_to_rocdl conversion pattern to transfo… (PR #146962)
ftynse wrote: Actually, re: > Authored-by: Son Tuan Vu vu...@google.com I think you should reupload after amending the commit to specify the correct author: `git commit --amend --author="..."`. GitHub tracks this correctly. https://github.com/llvm/llvm-project/pull/146962 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f arsenm wrote: These should always be printed with the named counter syntax https://github.com/llvm/llvm-project/pull/147257 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f Pierre-vh wrote: That's unexpected, right? Same for the vmcnt wait above. https://github.com/llvm/llvm-project/pull/147257 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/147257 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141589 >From d906a978145aabae8b2d1a029477d5a08272ae8c Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 11:16:16 +0200 Subject: [PATCH 1/4] [AMDGPU] Move S_BFE lowering into RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 14 +- .../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 51 +++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 125 -- 3 files changed, 119 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 9587fad1ecd63..94e1175b06b14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// Early select of uniform BFX into S_BFE instructions. +// These instructions encode the offset/width in a way that requires using +// bitwise operations. Selecting these instructions early allow the combiner +// to potentially fold these. +class lower_uniform_bfx : GICombineRule< + (defs root:$bfx), + (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>; + +def lower_uniform_sbfx : lower_uniform_bfx; +def lower_uniform_ubfx : lower_uniform_bfx; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. @@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner< zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, - cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> { + cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, + lower_uniform_sbfx, lower_uniform_ubfx]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index ee324a5e93f0f..2100900bb8eb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner { void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const; + bool lowerUniformBFX(MachineInstr &MI) const; + private: SIModeRegisterDefaults getMode() const; bool getIEEE() const; @@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_UBFX || + MI.getOpcode() == TargetOpcode::G_SBFX); + const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX); + + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI); + assert(RB && "No RB?"); + if (RB->getID() != AMDGPU::SGPRRegBankID) +return false; + + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(DstReg); + + const unsigned Opc = (Ty == S32) + ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) + : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); + + // Ensure the high bits are clear to insert the offset. 
+ auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6)); + auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + + // Zeros out the low bits, so don't bother clamping the input value. + auto ShiftAmt = B.buildConstant(S32, 16); + auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt); + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + + MRI.setRegBank(OffsetMask.getReg(0), *RB); + MRI.setRegBank(ClampOffset.getReg(0), *RB); + MRI.setRegBank(ShiftAmt.getReg(0), *RB); + MRI.setRegBank(ShiftWidth.getReg(0), *RB); + MRI.setRegBank(MergedInputs.getReg(0), *RB); + + auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) +llvm_unreachable("failed to constrain BFE"); + + MI.eraseFromParent(); + return true; +} + SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo()->getMode(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 6874657a4ffe7..140c2babb013f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/li
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141591 >From b386d126b9f560bf203fd044d81575ddfad2a8c6 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 12:29:02 +0200 Subject: [PATCH 1/2] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 59 - .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 61 +++--- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 63 +++ llvm/test/CodeGen/AMDGPU/div_i128.ll | 30 - llvm/test/CodeGen/AMDGPU/itofp.i128.ll| 11 ++-- llvm/test/CodeGen/AMDGPU/lround.ll| 18 +++--- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 16 + 8 files changed, 104 insertions(+), 157 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 96be17c487130..df867aaa204b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,6 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract, + known_bits_simplifications]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..cc0f45681a3e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1744,63 +1744,64 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6-LABEL: v_lshr_i65_33: ; GFX6: ; %bb.0: ; GFX6-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT:v_mov_b32_e32 v3, v1 -; GFX6-NEXT:v_mov_b32_e32 v0, 1 +; GFX6-NEXT:v_mov_b32_e32 v3, 1 +; GFX6-NEXT:v_mov_b32_e32 v4, 0 +; GFX6-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX6-NEXT:v_lshl_b64 v[2:3], v[3:4], 31 +; GFX6-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX6-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT:v_mov_b32_e32 v1, 0 -; GFX6-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX6-NEXT:v_lshl_b64 v[0:1], v[0:1], 31 -; GFX6-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX6-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT:v_mov_b32_e32 v2, 0 ; GFX6-NEXT:s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_i65_33: ; GFX8: ; %bb.0: ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT:v_mov_b32_e32 v3, v1 -; GFX8-NEXT:v_mov_b32_e32 v0, 1 +; GFX8-NEXT:v_mov_b32_e32 v3, 1 +; GFX8-NEXT:v_mov_b32_e32 v4, 0 +; GFX8-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX8-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX8-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX8-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT:v_mov_b32_e32 v1, 0 -; GFX8-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX8-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX8-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT:v_mov_b32_e32 v2, 0 ; GFX8-NEXT:s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_i65_33: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_mov_b32_e32 v3, v1 -; GFX9-NEXT:v_mov_b32_e32 v0, 1 +; GFX9-NEXT:v_mov_b32_e32 v3, 1 +; GFX9-NEXT:v_mov_b32_e32 v4, 0 +; GFX9-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX9-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX9-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX9-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT:v_mov_b32_e32 v1, 0 -; GFX9-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX9-NEXT:v_lshrrev_b32_e32 v2, 1, v3 
-; GFX9-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT:v_mov_b32_e32 v2, 0 ; GFX9-NEXT:s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_i65_33: ; GFX10: ; %bb.0: ; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT:v_mov_b32_e32 v3, v1 -; GFX10-NEXT:v_mov_b32_e32 v0, 1 +; GFX10-NEXT:v_mov_b32_e32 v3, 1 +; GFX10-NEXT:v_mov_b32_e32 v4, 0 +; GFX10-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT:v_lshrrev_b32_e32 v0, 1, v1 ; GFX10-NEXT:v_mov_b32_e32 v1, 0 -; GFX10-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX10-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT:v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX10-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT:v_mov_b32_e32 v2, 0 ; GFX10-NEXT:s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT:v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 -; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_an
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f ssahasra wrote: > These should always be printed with the named counter syntax I haven't checked what's different about this wait count for it to be printed like this. Will need to follow it up as a separate change. https://github.com/llvm/llvm-project/pull/147257 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f ssahasra wrote: If we agree with the basic design, then these are expected. There's a whole bunch of tests that either stop at the memory legalizer or run llc with `-O0`, like this one. The "trivial" wait counts show up in all these tests because SIInsertWaitcnts did not get a chance to clean them up. In particular, see how `TrySimplify` in that pass controls whether or not to clean up these wait counts. They disappear in the optimized ISA output. https://github.com/llvm/llvm-project/pull/147257 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
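For readers following the thread, a minimal sketch of the gating being described, assuming hypothetical helper and counter-field names around the `TrySimplify` flag named above:

```cpp
// Sketch only (hypothetical helpers and fields around the real TrySimplify
// flag): a soft wait whose counters all sit at the "no wait" sentinel does
// nothing at runtime. With simplification enabled it is erased; at -O0 it
// survives so tests that stop after the memory legalizer still observe it.
static bool simplifySoftWaitcnt(MachineInstr &MI, const AMDGPU::Waitcnt &Wait,
                                bool TrySimplify) {
  bool WaitsOnNothing =
      Wait.LoadCnt == ~0u && Wait.StoreCnt == ~0u && Wait.DsCnt == ~0u;
  if (TrySimplify && WaitsOnNothing) {
    MI.eraseFromParent();
    return true; // trivially satisfied wait removed
  }
  return false; // keep the soft wait for inspection
}
```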
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f Pierre-vh wrote: The waitcnts aren't optimized out at O0 because we want to see them in memory legalizer tests; however, we're mostly interested in the waitcnt zero, not the waitcnt ~0. We could still optimize out the ~0 ones; I don't think there is a downside to that. https://github.com/llvm/llvm-project/pull/147257 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f ssahasra wrote: Yes, I did consider that as an option. But there is the hypothetical corner case where the memory legalizer might deliberately compute the wait count to be so large that it gets clamped at the max value (not the same as ~0, strictly speaking). If that is not an issue, it will significantly reduce the diff for tests that don't stop after the legalizer. https://github.com/llvm/llvm-project/pull/147257 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
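A small sketch of the distinction being raised, with assumed names: a computed count clamped to the encodable maximum still expresses a real dependence, unlike the ~0 "wait on nothing" sentinel.

```cpp
// Sketch only, assumed names: clamping a large computed count must not be
// conflated with the ~0 "no wait" sentinel when deciding what can be
// optimized out.
static unsigned encodeWaitCount(unsigned Computed, unsigned MaxEncodable) {
  return std::min(Computed, MaxEncodable); // clamped, but still a real wait
}
static bool isNoWaitSentinel(unsigned Count) {
  return Count == ~0u; // trivially satisfied; candidate for removal
}
```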
[llvm-branch-commits] [llvm] Triple: Record default exception handling type (PR #147225)
https://github.com/jhuber6 approved this pull request. https://github.com/llvm/llvm-project/pull/147225 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] ARM: Remove subtarget field tracking SjLj (PR #147226)
https://github.com/DanielKristofKiss approved this pull request. lgtm https://github.com/llvm/llvm-project/pull/147226 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Triple: Record default exception handling type (PR #147225)
https://github.com/DanielKristofKiss approved this pull request. https://github.com/llvm/llvm-project/pull/147225 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
ssahasra wrote: This is part of a stack: - #147258 - #147257 - #147256 https://github.com/llvm/llvm-project/pull/147257 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] efficiently wait for direct loads to LDS at all scopes (PR #147258)
ssahasra wrote: This is part of a stack: - #147258 - #147257 - #147256 https://github.com/llvm/llvm-project/pull/147258 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] 902eaa1 - Revert "[clang][modules-driver] Add scanner to detect C++20 module presence (…"
Author: Corentin Jabot Date: 2025-07-07T14:05:22+02:00 New Revision: 902eaa1d5e24beb2a7129c61dfa84759a668f62f URL: https://github.com/llvm/llvm-project/commit/902eaa1d5e24beb2a7129c61dfa84759a668f62f DIFF: https://github.com/llvm/llvm-project/commit/902eaa1d5e24beb2a7129c61dfa84759a668f62f.diff LOG: Revert "[clang][modules-driver] Add scanner to detect C++20 module presence (…" This reverts commit ded142671663c404f4d9fb9ef4867b4fc680409a. Added: Modified: clang/include/clang/Basic/DiagnosticDriverKinds.td clang/include/clang/Basic/DiagnosticGroups.td clang/include/clang/Driver/Driver.h clang/include/clang/Driver/Options.td clang/lib/Driver/Driver.cpp clang/test/Frontend/warning-options.cpp Removed: clang/test/Driver/modules-driver-cxx20-module-usage-scanner.cpp diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index efba09dc140f6..34b6c0d7a8acd 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -577,16 +577,6 @@ def err_drv_reduced_module_output_overrided : Warning< "please consider use '-fmodule-output=' to specify the output file for reduced BMI explicitly">, InGroup>; -def remark_found_cxx20_module_usage : Remark< - "found C++20 module usage in file '%0'">, - InGroup; -def remark_performing_driver_managed_module_build : Remark< - "performing driver managed module build">, - InGroup; -def warn_modules_driver_unsupported_standard : Warning< - "'-fmodules-driver' is not supported before C++20">, - InGroup; - def warn_drv_delayed_template_parsing_after_cxx20 : Warning< "-fdelayed-template-parsing is deprecated after C++20">, InGroup>; diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index d2aa380f8d73f..36fa3227fd6a6 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -625,7 +625,6 @@ def ModuleConflict : DiagGroup<"module-conflict">; def ModuleFileExtension : DiagGroup<"module-file-extension">; def ModuleIncludeDirectiveTranslation : DiagGroup<"module-include-translation">; def ModuleMap : DiagGroup<"module-map">; -def ModulesDriver : DiagGroup<"modules-driver">; def RoundTripCC1Args : DiagGroup<"round-trip-cc1-args">; def NewlineEOF : DiagGroup<"newline-eof">; def Nullability : DiagGroup<"nullability">; diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 2a04f6dd27655..d9e328fe918bc 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -504,9 +504,6 @@ class Driver { /// BuildActions - Construct the list of actions to perform for the /// given arguments, which are only done for a single architecture. - /// If the compilation is an explicit module build, delegates to - /// BuildDriverManagedModuleBuildActions. Otherwise, BuildDefaultActions is - /// used. /// /// \param C - The compilation that is being built. /// \param Args - The input arguments. @@ -792,35 +789,6 @@ class Driver { /// compilation based on which -f(no-)?lto(=.*)? option occurs last. void setLTOMode(const llvm::opt::ArgList &Args); - /// BuildDefaultActions - Constructs the list of actions to perform - /// for the provided arguments, which are only done for a single architecture. - /// - /// \param C - The compilation that is being built. - /// \param Args - The input arguments. - /// \param Actions - The list to store the resulting actions onto. 
- void BuildDefaultActions(Compilation &C, llvm::opt::DerivedArgList &Args, - const InputList &Inputs, ActionList &Actions) const; - - /// BuildDriverManagedModuleBuildActions - Performs a dependency - /// scan and constructs the list of actions to perform for dependency order - /// and the provided arguments. This is only done for a single a architecture. - /// - /// \param C - The compilation that is being built. - /// \param Args - The input arguments. - /// \param Actions - The list to store the resulting actions onto. - void BuildDriverManagedModuleBuildActions(Compilation &C, -llvm::opt::DerivedArgList &Args, -const InputList &Inputs, -ActionList &Actions) const; - - /// Scans the leading lines of the C++ source inputs to detect C++20 module - /// usage. - /// - /// \returns True if module usage is detected, false otherwise, or an error on - /// read failure. - llvm::ErrorOr - ScanInputsForCXXModuleUsage(const InputList &Inputs) const; - /// Retrieves a ToolChain for a particular \p Target triple. /// /// Will cache ToolChains for the life of the driver object, and create them
[llvm-branch-commits] [llvm] [AMDGPU] always emit a soft wait even if it is trivially ~0 (PR #147257)
@@ -669,6 +679,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT:s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT:s_wait_storecnt 0x0 ; GFX12-WGP-NEXT:global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-WGP-NEXT:s_wait_loadcnt 0x3f Pierre-vh wrote: That's a valid concern, though the MemoryLegalizer currently only inserts waitcnts 0, I think? I also don't see why the memory legalizer would insert non-zero soft waitcnts; I think those would need to be non-soft (but that's not enforced anywhere, afaik). https://github.com/llvm/llvm-project/pull/147257 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)
https://github.com/usx95 updated https://github.com/llvm/llvm-project/pull/147295 >From e870b040c4ef29b7ca2e50c1fc0ab5a2446f5cf6 Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Sun, 6 Jul 2025 19:12:55 + Subject: [PATCH] [LifetimeSafety] Propagate loans using dataflow analysis --- clang/lib/Analysis/LifetimeSafety.cpp | 258 +- .../Sema/warn-lifetime-safety-dataflow.cpp| 186 + 2 files changed, 443 insertions(+), 1 deletion(-) diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index 2c2309de90e26..e881e592ef59f 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -13,7 +13,10 @@ #include "clang/Analysis/Analyses/PostOrderCFGView.h" #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" +#include "clang/Analysis/FlowSensitive/DataflowWorklist.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/ImmutableMap.h" +#include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" @@ -482,7 +485,247 @@ class FactGenerator : public ConstStmtVisitor { }; // = // -// TODO: Run dataflow analysis to propagate loans, analyse and error reporting. +// The Dataflow Lattice +// = // + +// Using LLVM's immutable collections is efficient for dataflow analysis +// as it avoids deep copies during state transitions. +// TODO(opt): Consider using a bitset to represent the set of loans. +using LoanSet = llvm::ImmutableSet; +using OriginLoanMap = llvm::ImmutableMap; + +/// An object to hold the factories for immutable collections, ensuring +/// that all created states share the same underlying memory management. +struct LifetimeFactory { + OriginLoanMap::Factory OriginMapFact; + LoanSet::Factory LoanSetFact; + + LoanSet createLoanSet(LoanID LID) { +return LoanSetFact.add(LoanSetFact.getEmptySet(), LID); + } +}; + +/// LifetimeLattice represents the state of our analysis at a given program +/// point. It is an immutable object, and all operations produce a new +/// instance rather than modifying the existing one. +struct LifetimeLattice { + /// The map from an origin to the set of loans it contains. + /// TODO(opt): To reduce the lattice size, propagate origins of declarations, + /// not expressions, because expressions are not visible across blocks. + OriginLoanMap Origins = OriginLoanMap(nullptr); + + explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {} + LifetimeLattice() = default; + + bool operator==(const LifetimeLattice &Other) const { +return Origins == Other.Origins; + } + bool operator!=(const LifetimeLattice &Other) const { +return !(*this == Other); + } + + LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const { +if (auto *Loans = Origins.lookup(OID)) + return *Loans; +return Factory.LoanSetFact.getEmptySet(); + } + + /// Computes the union of two lattices by performing a key-wise join of + /// their OriginLoanMaps. + // TODO(opt): This key-wise join is a performance bottleneck. A more + // efficient merge could be implemented using a Patricia Trie or HAMT + // instead of the current AVL-tree-based ImmutableMap. + LifetimeLattice join(const LifetimeLattice &Other, + LifetimeFactory &Factory) const { +/// Merge the smaller map into the larger one ensuring we iterate over the +/// smaller map. +if (Origins.getHeight() < Other.Origins.getHeight()) + return Other.join(*this, Factory); + +OriginLoanMap JoinedState = Origins; +// For each origin in the other map, union its loan set with ours. 
+for (const auto &Entry : Other.Origins) { + OriginID OID = Entry.first; + LoanSet OtherLoanSet = Entry.second; + JoinedState = Factory.OriginMapFact.add( + JoinedState, OID, + join(getLoans(OID, Factory), OtherLoanSet, Factory)); +} +return LifetimeLattice(JoinedState); + } + + LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const { +/// Merge the smaller set into the larger one ensuring we iterate over the +/// smaller set. +if (a.getHeight() < b.getHeight()) + std::swap(a, b); +LoanSet Result = a; +for (LoanID LID : b) { + /// TODO(opt): Profiling shows that this loop is a major performance + /// bottleneck. Investigate using a BitVector to represent the set of + /// loans for improved join performance. + Result = Factory.LoanSetFact.add(Result, LID); +} +return Result; + } + + void dump(llvm::raw_ostream &OS) const { +OS << "LifetimeLattice State:\n"; +if (Origins.isEmpty()) + OS << " \n"; +for (const auto &Entry : Origins) { + if (Entry.second.isEmpty()) +OS
[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)
https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147295 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)
llvmbot wrote: @llvm/pr-subscribers-clang-analysis Author: Utkarsh Saxena (usx95) Changes This patch introduces the core dataflow analysis infrastructure for the C++ Lifetime Safety checker. This change implements the logic to propagate "loan" information across the control-flow graph. The primary goal is to compute a fixed-point state that accurately models which pointer (Origin) can hold which borrow (Loan) at any given program point. Key components * `LifetimeLattice`: Defines the dataflow state, mapping an `OriginID` to a `LoanSet` using `llvm::ImmutableMap`. * `Transferer`: Implements the transfer function, which updates the `LifetimeLattice` by applying the lifetime facts (Issue, AssignOrigin, etc.) generated for each basic block. * `LifetimeDataflow`: A forward dataflow analysis driver that uses a worklist algorithm to iterate over the CFG until the lattice state converges. The existing test suite has been extended to check the final dataflow results. This work is a prerequisite for the final step of the analysis: consuming these results to identify and report lifetime violations. --- Patch is 20.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147295.diff 2 Files Affected: - (modified) clang/lib/Analysis/LifetimeSafety.cpp (+257-1) - (modified) clang/test/Sema/warn-lifetime-safety-dataflow.cpp (+186) ``diff diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index 2c2309de90e26..e881e592ef59f 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -13,7 +13,10 @@ #include "clang/Analysis/Analyses/PostOrderCFGView.h" #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" +#include "clang/Analysis/FlowSensitive/DataflowWorklist.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/ImmutableMap.h" +#include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" @@ -482,7 +485,247 @@ class FactGenerator : public ConstStmtVisitor { }; // = // -// TODO: Run dataflow analysis to propagate loans, analyse and error reporting. +// The Dataflow Lattice +// = // + +// Using LLVM's immutable collections is efficient for dataflow analysis +// as it avoids deep copies during state transitions. +// TODO(opt): Consider using a bitset to represent the set of loans. +using LoanSet = llvm::ImmutableSet; +using OriginLoanMap = llvm::ImmutableMap; + +/// An object to hold the factories for immutable collections, ensuring +/// that all created states share the same underlying memory management. +struct LifetimeFactory { + OriginLoanMap::Factory OriginMapFact; + LoanSet::Factory LoanSetFact; + + LoanSet createLoanSet(LoanID LID) { +return LoanSetFact.add(LoanSetFact.getEmptySet(), LID); + } +}; + +/// LifetimeLattice represents the state of our analysis at a given program +/// point. It is an immutable object, and all operations produce a new +/// instance rather than modifying the existing one. +struct LifetimeLattice { + /// The map from an origin to the set of loans it contains. + /// TODO(opt): To reduce the lattice size, propagate origins of declarations, + /// not expressions, because expressions are not visible across blocks. 
+ OriginLoanMap Origins = OriginLoanMap(nullptr); + + explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {} + LifetimeLattice() = default; + + bool operator==(const LifetimeLattice &Other) const { +return Origins == Other.Origins; + } + bool operator!=(const LifetimeLattice &Other) const { +return !(*this == Other); + } + + LoanSet getLoans(OriginID OID, LifetimeFactory &Factory) const { +if (auto *Loans = Origins.lookup(OID)) + return *Loans; +return Factory.LoanSetFact.getEmptySet(); + } + + /// Computes the union of two lattices by performing a key-wise join of + /// their OriginLoanMaps. + // TODO(opt): This key-wise join is a performance bottleneck. A more + // efficient merge could be implemented using a Patricia Trie or HAMT + // instead of the current AVL-tree-based ImmutableMap. + LifetimeLattice join(const LifetimeLattice &Other, + LifetimeFactory &Factory) const { +/// Merge the smaller map into the larger one ensuring we iterate over the +/// smaller map. +if (Origins.getHeight() < Other.Origins.getHeight()) + return Other.join(*this, Factory); + +OriginLoanMap JoinedState = Origins; +// For each origin in the other map, union its loan set with ours. +for (const auto &Entry : Other.Origins) { + OriginID OID = Entry.first; + LoanSet OtherLoanSet = Entry.second; + JoinedState = Factory.OriginMapFact.add( +
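To visualize the dataflow driver summarized in the description above, here is a minimal sketch of the forward fixed-point loop. `LifetimeLattice` comes from the patch; the worklist API shape and the helper names (`joinPredecessorExits`, `transferBlock`) are assumptions for illustration, not the committed implementation.

```cpp
// Sketch only: iterate blocks until every per-block exit state stops changing.
ForwardDataflowWorklist Worklist(Cfg, AC); // assumed worklist construction
Worklist.enqueueBlock(&Cfg.getEntry());
llvm::DenseMap<unsigned, LifetimeLattice> ExitStates;
while (const CFGBlock *B = Worklist.dequeue()) {
  LifetimeLattice Entry = joinPredecessorExits(*B, ExitStates, Factory);
  LifetimeLattice Exit = transferBlock(*B, Entry); // apply this block's facts
  LifetimeLattice &Known = ExitStates[B->getBlockID()];
  if (Exit != Known) {
    Known = Exit;                  // state changed: revisit successors
    Worklist.enqueueSuccessors(B); // assumed enqueue API
  }
}
```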
[llvm-branch-commits] [clang] [LifetimeSafety] Implement dataflow analysis for loan propagation (PR #147295)
https://github.com/usx95 ready_for_review https://github.com/llvm/llvm-project/pull/147295 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Drive profile validator from opt (PR #147418)
https://github.com/mtrofin created https://github.com/llvm/llvm-project/pull/147418 None >From 03b555bc71ff35cee5b97a8c9d7883396d4d7f31 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Mon, 7 Jul 2025 12:44:41 -0700 Subject: [PATCH] Drive profile validator from opt --- llvm/tools/opt/NewPMDriver.cpp | 8 ++-- llvm/tools/opt/NewPMDriver.h | 2 +- llvm/tools/opt/optdriver.cpp | 7 ++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp index 7d168a6ceb17c..042ed027639bc 100644 --- a/llvm/tools/opt/NewPMDriver.cpp +++ b/llvm/tools/opt/NewPMDriver.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Instrumentation/AddressSanitizer.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Debugify.h" +#include "llvm/Transforms/Utils/ProfileValidation.h" using namespace llvm; using namespace opt_tool; @@ -356,7 +357,7 @@ bool llvm::runPassPipeline( OutputKind OK, VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder, bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex, bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve, -bool UnifiedLTO) { +bool EnableProfcheck, bool UnifiedLTO) { auto FS = vfs::getRealFileSystem(); std::optional P; switch (PGOKindFlag) { @@ -487,7 +488,8 @@ bool llvm::runPassPipeline( if (VerifyDIPreserve) MPM.addPass(NewPMDebugifyPass(DebugifyMode::OriginalDebugInfo, "", &DebugInfoBeforePass)); - + if (EnableProfcheck) +MPM.addPass(createModuleToFunctionPassAdaptor(ProfileInjectorPass())); // Add passes according to the -passes options. if (!PassPipeline.empty()) { if (auto Err = PB.parsePassPipeline(MPM, PassPipeline)) { @@ -504,6 +506,8 @@ bool llvm::runPassPipeline( MPM.addPass(NewPMCheckDebugifyPass( false, "", nullptr, DebugifyMode::OriginalDebugInfo, &DebugInfoBeforePass, VerifyDIPreserveExport)); + if (EnableProfcheck) +MPM.addPass(createModuleToFunctionPassAdaptor(ProfileVerifierPass())); // Add any relevant output pass at the end of the pipeline. switch (OK) { diff --git a/llvm/tools/opt/NewPMDriver.h b/llvm/tools/opt/NewPMDriver.h index 2daae571e72c2..6c21d6cae4e75 100644 --- a/llvm/tools/opt/NewPMDriver.h +++ b/llvm/tools/opt/NewPMDriver.h @@ -75,7 +75,7 @@ bool runPassPipeline( bool ShouldPreserveAssemblyUseListOrder, bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex, bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve, -bool UnifiedLTO = false); +bool EnableProfcheck, bool UnifiedLTO = false); } // namespace llvm #endif diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp index de46efa13025d..588110361466d 100644 --- a/llvm/tools/opt/optdriver.cpp +++ b/llvm/tools/opt/optdriver.cpp @@ -213,6 +213,10 @@ static cl::opt VerifyDebugInfoPreserve( cl::desc("Start the pipeline with collecting and end it with checking of " "debug info preservation.")); +static cl::opt EnableProfileVerification( +"enable-profcheck", cl::init(true), +cl::desc("Start the pipeline with prof-inject and end it with prof-check")); + static cl::opt ClDataLayout("data-layout", cl::desc("data layout string to use"), cl::value_desc("layout-string"), @@ -731,7 +735,8 @@ extern "C" int optMain( RemarksFile.get(), Pipeline, PluginList, PassBuilderCallbacks, OK, VK, PreserveAssemblyUseListOrder, PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash, - EnableDebugify, VerifyDebugInfoPreserve, UnifiedLTO) + EnableDebugify, VerifyDebugInfoPreserve, + EnableProfileVerification, UnifiedLTO) ? 
0 : 1; } ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [mlir] [Flang][MLIR] Add `!$omp unroll` and `omp.unroll_heuristic` (PR #144785)
https://github.com/Meinersbur edited https://github.com/llvm/llvm-project/pull/144785 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [mlir] [Flang][MLIR] Add `!$omp unroll` and `omp.unroll_heuristic` (PR #144785)
https://github.com/Meinersbur ready_for_review https://github.com/llvm/llvm-project/pull/144785 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Triple: Record default exception handling type (PR #147225)
https://github.com/arsenm closed https://github.com/llvm/llvm-project/pull/147225 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [LifetimeSafety] Add script performance benchmarking (PR #147315)
https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147315
[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/147409
[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)
llvmbot wrote: @sunfishcode What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/147409
[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)
https://github.com/sunfishcode approved this pull request. https://github.com/llvm/llvm-project/pull/147409
[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)
llvmbot wrote: @llvm/pr-subscribers-backend-webassembly Author: None (llvmbot) Changes Backport a8a9a7f Requested by: @sunfishcode --- Full diff: https://github.com/llvm/llvm-project/pull/147409.diff 2 Files Affected: - (modified) llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td (+2-2) - (modified) llvm/test/CodeGen/WebAssembly/inline-asm.ll (+11) ``diff diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 17889dacc868c..31a33c1e7365b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -64,8 +64,8 @@ def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32, I32_0)>; def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64, I64_0)>; def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>; def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>; -def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8, -v8i16], +def V128 : WebAssemblyRegClass<[v2i64, v4i32, v16i8, v8i16, +v8f16, v4f32, v2f64], 128, (add V128_0)>; def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>; def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>; diff --git a/llvm/test/CodeGen/WebAssembly/inline-asm.ll b/llvm/test/CodeGen/WebAssembly/inline-asm.ll index 4462cfb7aa0c4..c378fd953a555 100644 --- a/llvm/test/CodeGen/WebAssembly/inline-asm.ll +++ b/llvm/test/CodeGen/WebAssembly/inline-asm.ll @@ -129,7 +129,18 @@ entry: ret i32 %ret } +; CHECK-LABEL: v128_load +; CHECK: local.get 0 +; CHECK-NEXT: v128.load 0 +; CHECK-NEXT: local.set 1 +define <4 x i32> @v128_load(ptr %v) #1 { +entry: + %0 = tail call <4 x i32> asm "local.get $1\0Av128.load 0\0Alocal.set $0", "=r,r"(ptr %v) + ret <4 x i32> %0 +} + attributes #0 = { nounwind } +attributes #1 = { "target-features"="+simd128" } !0 = !{i32 47} !1 = !{i32 145} `` https://github.com/llvm/llvm-project/pull/147409 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [WebAssembly] Fix inline assembly with vector types (#146574) (PR #147409)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/147409 Backport a8a9a7f Requested by: @sunfishcode >From 9fd5816e48736cc51a118311e805d3e1f3758092 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 2 Jul 2025 05:26:30 +0200 Subject: [PATCH] [WebAssembly] Fix inline assembly with vector types (#146574) This commit fixes using inline assembly with v128 results. Previously this failed with an internal assertion about a failure to legalize a `CopyFromReg` where the source register was typed `v8f16`. It looks like the type used for the destination register was whatever was listed first in the `def V128 : WebAssemblyRegClass` listing, so the types were shuffled around to have a default-supported type. A small test was added as well which failed to generate previously and should now pass in generation. This test passed on LLVM 18 additionally and regressed by accident in #93228 which was first included in LLVM 19. (cherry picked from commit a8a9a7f95a695c02bdf3d5821d1c62cc8e08c2ff) --- .../lib/Target/WebAssembly/WebAssemblyRegisterInfo.td | 4 ++-- llvm/test/CodeGen/WebAssembly/inline-asm.ll | 11 +++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 17889dacc868c..31a33c1e7365b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -64,8 +64,8 @@ def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32, I32_0)>; def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64, I64_0)>; def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>; def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>; -def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8, -v8i16], +def V128 : WebAssemblyRegClass<[v2i64, v4i32, v16i8, v8i16, +v8f16, v4f32, v2f64], 128, (add V128_0)>; def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>; def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>; diff --git a/llvm/test/CodeGen/WebAssembly/inline-asm.ll b/llvm/test/CodeGen/WebAssembly/inline-asm.ll index 4462cfb7aa0c4..c378fd953a555 100644 --- a/llvm/test/CodeGen/WebAssembly/inline-asm.ll +++ b/llvm/test/CodeGen/WebAssembly/inline-asm.ll @@ -129,7 +129,18 @@ entry: ret i32 %ret } +; CHECK-LABEL: v128_load +; CHECK: local.get 0 +; CHECK-NEXT: v128.load 0 +; CHECK-NEXT: local.set 1 +define <4 x i32> @v128_load(ptr %v) #1 { +entry: + %0 = tail call <4 x i32> asm "local.get $1\0Av128.load 0\0Alocal.set $0", "=r,r"(ptr %v) + ret <4 x i32> %0 +} + attributes #0 = { nounwind } +attributes #1 = { "target-features"="+simd128" } !0 = !{i32 47} !1 = !{i32 145} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] Users/usx95/lifetime safety benchmarking (PR #147315)
https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147315
[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Ross Brunton (RossBrunton) Changes When generating the device info tree, nodes can be marked with an offload Device Info value. The nodes can also look up children based on this value. --- Full diff: https://github.com/llvm/llvm-project/pull/147317.diff 3 Files Affected: - (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+7-4) - (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+24-3) - (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+5-3) ``diff diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 832c31c43b5d2..52ea3283b24ef 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2562,7 +2562,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) Info.add("HSA Runtime Version", - std::to_string(Major) + "." + std::to_string(Minor)); + std::to_string(Major) + "." + std::to_string(Minor), "", + DeviceInfo::DRIVER_VERSION); Info.add("HSA OpenMP Device Number", DeviceId); @@ -2572,11 +2573,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Device Name", TmpChar); + Info.add("Device Name", TmpChar, "", DeviceInfo::NAME); Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Vendor Name", TmpChar); + Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR); hsa_device_type_t DevType; Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); @@ -2652,7 +2653,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); if (Status == HSA_STATUS_SUCCESS) { - auto &MaxSize = *Info.add("Workgroup Max Size per Dimension"); + auto &MaxSize = + *Info.add("Workgroup Max Size per Dimension", std::monostate{}, "", +DeviceInfo::MAX_WORK_GROUP_SIZE); MaxSize.add("x", WorkgrpMaxDim[0]); MaxSize.add("y", WorkgrpMaxDim[1]); MaxSize.add("z", WorkgrpMaxDim[2]); diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index b5addc13d6644..9dc01ca0277fe 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -113,6 +113,12 @@ struct AsyncInfoWrapperTy { __tgt_async_info *AsyncInfoPtr; }; +enum class DeviceInfo { +#define OFFLOAD_DEVINFO(Name, _, Value) Name = Value, +#include "OffloadInfo.inc" +#undef OFFLOAD_DEVINFO +}; + /// Tree node for device information /// /// This information is either printed or used by liboffload to extract certain @@ -133,6 +139,8 @@ struct InfoTreeNode { // * The same key can appear multiple times std::unique_ptr> Children; + std::map DeviceInfoMap; + InfoTreeNode() : InfoTreeNode("", std::monostate{}, "") {} InfoTreeNode(std::string Key, VariantType Value, std::string Units) : Key(Key), Value(Value), Units(Units) {} @@ -140,10 +148,12 @@ struct InfoTreeNode { /// Add a new info entry as a child of this node. The entry requires at least /// a key string in \p Key. The value in \p Value is optional and can be any /// type that is representable as a string. The units in \p Units is optional - /// and must be a string. 
+ /// and must be a string. Providing a device info key allows liboffload to + /// use that value for an appropriate olGetDeviceInfo query template InfoTreeNode *add(std::string Key, T Value = T(), -const std::string &Units = std::string()) { +const std::string &Units = std::string(), +std::optional DeviceInfoKey = std::nullopt) { assert(!Key.empty() && "Invalid info key"); if (!Children) @@ -157,7 +167,12 @@ struct InfoTreeNode { else ValueVariant = std::string{Value}; -return &Children->emplace_back(Key, ValueVariant, Units); +auto Ptr = &Children->emplace_back(Key, ValueVariant, Units); + +if (DeviceInfoKey) + DeviceInfoMap[*DeviceInfoKey] = Children->size() - 1; + +return Ptr; } std::optional get(StringRef Key) { @@ -171,6 +186,12 @@ struct InfoTreeNode { return It; } + std::optional get(DeviceInfo Info) { +if (DeviceInfoMap.count(Info)) + return &(*Children)[DeviceInfoMap[Info]]; +return std::nullopt; + } + /// Print all info entries in the tree void print() const { // Fake an additional indent so that
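For digest readers, a minimal usage sketch of the tagging mechanism this patch introduces. It assumes the `InfoTreeNode` and `DeviceInfo` definitions exactly as in the hunks above; the device-name string is a made-up placeholder, not anything the patch itself produces.

```cpp
#include <cassert>
#include <string>
#include <variant>
// InfoTreeNode and DeviceInfo are assumed to come from PluginInterface.h
// as modified by this patch.

void tagAndQueryExample(InfoTreeNode &Info) {
  // Tag the entry at creation time so liboffload can find it by enum value
  // rather than by matching the human-readable key string.
  Info.add("Device Name", std::string("gfx90a"), "", DeviceInfo::NAME);

  // Look the entry up by tag; the key string no longer matters here.
  if (auto Entry = Info.get(DeviceInfo::NAME)) {
    const std::string &Name = std::get<std::string>((*Entry)->Value);
    assert(Name == "gfx90a");
  }
}
```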
[llvm-branch-commits] [clang] [LifetimeSafety] Add script performance benchmarking (PR #147315)
github-actions[bot] wrote: :warning: Python code formatter, darker found issues in your code. :warning: You can test this locally with the following command: ``bash darker --check --diff -r HEAD~1...HEAD clang/test/Analysis/lifetime_safety/benchmark.py `` View the diff from darker here. ``diff --- benchmark.py2025-07-07 15:13:00.00 + +++ benchmark.py2025-07-07 15:15:18.715309 + @@ -7,10 +7,11 @@ from datetime import datetime import numpy as np from scipy.optimize import curve_fit from scipy.stats import t + def generate_cpp_cycle_test(n: int) -> str: """ Generates a C++ code snippet with a specified number of pointers in a cycle. """ if n <= 0: @@ -32,10 +33,11 @@ cpp_code += f"p{n} = temp;\n" cpp_code += " }\n}\n" cpp_code += f"\nint main() {{ long_cycle_{n}(false); return 0; }}\n" return cpp_code + def generate_cpp_merge_test(n: int) -> str: """ Generates a C++ code snippet with N independent conditional assignments. """ if n <= 0: @@ -53,163 +55,188 @@ cpp_code += "}\n" cpp_code += f"\nint main() {{ conditional_merges_{n}(false); return 0; }}\n" return cpp_code + def analyze_trace_file(trace_path: str) -> tuple[float, float]: """ Parses the -ftime-trace JSON output to find durations. Returns: A tuple of (lifetime_analysis_duration_us, total_clang_duration_us). """ lifetime_duration = 0.0 total_duration = 0.0 try: -with open(trace_path, 'r') as f: +with open(trace_path, "r") as f: trace_data = json.load(f) -for event in trace_data.get('traceEvents', []): -if event.get('name') == 'LifetimeAnalysis': -lifetime_duration += float(event.get('dur', 0)) -if event.get('name') == 'ExecuteCompiler': -total_duration += float(event.get('dur', 0)) +for event in trace_data.get("traceEvents", []): +if event.get("name") == "LifetimeAnalysis": +lifetime_duration += float(event.get("dur", 0)) +if event.get("name") == "ExecuteCompiler": +total_duration += float(event.get("dur", 0)) except (IOError, json.JSONDecodeError) as e: print(f"Error reading or parsing trace file {trace_path}: {e}", file=sys.stderr) return 0.0, 0.0 return lifetime_duration, total_duration + def power_law(n, c, k): """Represents the power law function: y = c * n^k""" return c * np.power(n, k) + def human_readable_time(ms: float) -> str: """Converts milliseconds to a human-readable string (ms or s).""" if ms >= 1000: return f"{ms / 1000:.2f} s" return f"{ms:.2f} ms" + def generate_markdown_report(results: dict) -> str: """Generates a Markdown-formatted report from the benchmark results.""" report = [] timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S %Z") report.append(f"# Lifetime Analysis Performance Report") report.append(f"> Generated on: {timestamp}") report.append("\n---\n") for test_type, data in results.items(): -title = 'Pointer Cycle in Loop' if test_type == 'cycle' else 'CFG Merges' +title = "Pointer Cycle in Loop" if test_type == "cycle" else "CFG Merges" report.append(f"## Test Case: {title}") report.append("") # Table header report.append("| N | Analysis Time | Total Clang Time |") report.append("|:|--:|-:|") # Table rows -n_data = np.array(data['n']) -analysis_data = np.array(data['lifetime_ms']) -total_data = np.array(data['total_ms']) +n_data = np.array(data["n"]) +analysis_data = np.array(data["lifetime_ms"]) +total_data = np.array(data["total_ms"]) for i in range(len(n_data)): analysis_str = human_readable_time(analysis_data[i]) total_str = human_readable_time(total_data[i]) report.append(f"| {n_data[i]:<3} | {analysis_str:>13} | {total_str:>16} |") report.append("") # Complexity analysis 
report.append(f"**Complexity Analysis:**") try: -popt, pcov = curve_fit(power_law, n_data, analysis_data, p0=[0, 2], maxfev=5000) +popt, pcov = curve_fit( +power_law, n_data, analysis_data, p0=[0, 2], maxfev=5000 +) _, k = popt - + # R-squared calculation residuals = analysis_data - power_law(n_data, *popt) ss_res = np.sum(residuals**2) -ss_tot = np.sum((analysis_data - np.mean(analysis_data))**2) +ss_tot = np.sum((analysis_data - np.mean(analysis_data)) ** 2) r_squared = 1 - (ss_res / ss_tot) - + # Confidence I
[llvm-branch-commits] [llvm] [Offload] Refactor device information queries to use new tagging (PR #147318)
https://github.com/RossBrunton created https://github.com/llvm/llvm-project/pull/147318 Instead using strings to look up device information (which is brittle and slow), use the new tags that the plugins specify when building the nodes. >From 4cce1eec173637a0e50655e10ad520a9821b9960 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Mon, 7 Jul 2025 16:13:32 +0100 Subject: [PATCH] [Offload] Refactor device information queries to use new tagging Instead using strings to look up device information (which is brittle and slow), use the new tags that the plugins specify when building the nodes. --- offload/liboffload/src/Helpers.hpp | 19 ++--- offload/liboffload/src/OffloadImpl.cpp | 111 +++-- 2 files changed, 54 insertions(+), 76 deletions(-) diff --git a/offload/liboffload/src/Helpers.hpp b/offload/liboffload/src/Helpers.hpp index 8b85945508b98..62e55e500fac7 100644 --- a/offload/liboffload/src/Helpers.hpp +++ b/offload/liboffload/src/Helpers.hpp @@ -75,23 +75,16 @@ class InfoWriter { InfoWriter(InfoWriter &) = delete; ~InfoWriter() = default; - template llvm::Error write(llvm::Expected &&Val) { -if (Val) - return getInfo(Size, Target, SizeRet, *Val); -return Val.takeError(); + template llvm::Error write(T Val) { +return getInfo(Size, Target, SizeRet, Val); } - template - llvm::Error writeArray(llvm::Expected &&Val, size_t Elems) { -if (Val) - return getInfoArray(Elems, Size, Target, SizeRet, *Val); -return Val.takeError(); + template llvm::Error writeArray(T Val, size_t Elems) { +return getInfoArray(Elems, Size, Target, SizeRet, Val); } - llvm::Error writeString(llvm::Expected &&Val) { -if (Val) - return getInfoString(Size, Target, SizeRet, *Val); -return Val.takeError(); + llvm::Error writeString(llvm::StringRef Val) { +return getInfoString(Size, Target, SizeRet, Val); } private: diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index f9da638436705..c84bf01460252 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -286,78 +286,63 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Plugin::error(ErrorCode::UNIMPLEMENTED, ErrBuffer.c_str()); }; - // Find the info if it exists under any of the given names - auto getInfoString = - [&](std::vector Names) -> llvm::Expected { -for (auto &Name : Names) { - if (auto Entry = Device->Info.get(Name)) { -if (!std::holds_alternative((*Entry)->Value)) - return makeError(ErrorCode::BACKEND_FAILURE, - "plugin returned incorrect type"); -return std::get((*Entry)->Value).c_str(); - } -} - -return makeError(ErrorCode::UNIMPLEMENTED, - "plugin did not provide a response for this information"); - }; - - auto getInfoXyz = - [&](std::vector Names) -> llvm::Expected { -for (auto &Name : Names) { - if (auto Entry = Device->Info.get(Name)) { -auto Node = *Entry; -ol_dimensions_t Out{0, 0, 0}; - -auto getField = [&](StringRef Name, uint32_t &Dest) { - if (auto F = Node->get(Name)) { -if (!std::holds_alternative((*F)->Value)) - return makeError( - ErrorCode::BACKEND_FAILURE, - "plugin returned incorrect type for dimensions element"); -Dest = std::get((*F)->Value); - } else -return makeError(ErrorCode::BACKEND_FAILURE, - "plugin didn't provide all values for dimensions"); - return Plugin::success(); -}; - -if (auto Res = getField("x", Out.x)) - return Res; -if (auto Res = getField("y", Out.y)) - return Res; -if (auto Res = getField("z", Out.z)) - return Res; - -return Out; - } -} + // These are not implemented by the plugin interface + if (PropName == 
OL_DEVICE_INFO_PLATFORM) +return Info.write(Device->Platform); + if (PropName == OL_DEVICE_INFO_TYPE) +return Info.write(OL_DEVICE_TYPE_GPU); + // TODO: Update when https://github.com/llvm/llvm-project/pull/147314 is merged + if (PropName > OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE) +return createOffloadError(ErrorCode::INVALID_ENUMERATION, + "getDeviceInfo enum '%i' is invalid", PropName); + auto EntryOpt = Device->Info.get(static_cast(PropName)); + if (!EntryOpt) return makeError(ErrorCode::UNIMPLEMENTED, "plugin did not provide a response for this information"); - }; + auto Entry = *EntryOpt; switch (PropName) { - case OL_DEVICE_INFO_PLATFORM: -return Info.write(Device->Platform); - case OL_DEVICE_INFO_TYPE: -return Info.write(OL_DEVICE_TYPE_GPU); case OL_DEVICE_INFO_NAME: -return Info.writeString(getInfoString({"Device Name"})); case OL_DEVICE_INFO_VENDOR:
[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)
@@ -133,17 +139,21 @@ struct InfoTreeNode { // * The same key can appear multiple times std::unique_ptr<llvm::SmallVector<InfoTreeNode, 8>> Children; + std::map<DeviceInfo, size_t> DeviceInfoMap; jhuber6 wrote: Do these need to be sorted? Otherwise a dense map is more efficient. https://github.com/llvm/llvm-project/pull/147317
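Spelled out, the reviewer's alternative would look roughly like this; a sketch, assuming LLVM's generic enum `DenseMapInfo` specialization covers `DeviceInfo` (insertion order is irrelevant for this enum-to-index map, so no ordering guarantee is lost):

```cpp
#include "llvm/ADT/DenseMap.h"

// Hash map instead of a red-black tree: cheaper lookups, no ordering
// guarantees -- which this index map does not need anyway.
llvm::DenseMap<DeviceInfo, size_t> DeviceInfoMap;
```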
[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)
@@ -171,6 +186,12 @@ struct InfoTreeNode { return It; } + std::optional<InfoTreeNode *> get(DeviceInfo Info) { +if (DeviceInfoMap.count(Info)) + return &(*Children)[DeviceInfoMap[Info]]; +return std::nullopt; jhuber6 wrote: ```suggestion return !DeviceInfoMap.count(Info) ? std::nullopt : &(*Children)[DeviceInfoMap[Info]]; ``` https://github.com/llvm/llvm-project/pull/147317
[llvm-branch-commits] [llvm] [Offload] Refactor device information queries to use new tagging (PR #147318)
llvmbot wrote: @llvm/pr-subscribers-offload Author: Ross Brunton (RossBrunton) Changes Instead using strings to look up device information (which is brittle and slow), use the new tags that the plugins specify when building the nodes. --- Full diff: https://github.com/llvm/llvm-project/pull/147318.diff 2 Files Affected: - (modified) offload/liboffload/src/Helpers.hpp (+6-13) - (modified) offload/liboffload/src/OffloadImpl.cpp (+48-63) ``diff diff --git a/offload/liboffload/src/Helpers.hpp b/offload/liboffload/src/Helpers.hpp index 8b85945508b98..62e55e500fac7 100644 --- a/offload/liboffload/src/Helpers.hpp +++ b/offload/liboffload/src/Helpers.hpp @@ -75,23 +75,16 @@ class InfoWriter { InfoWriter(InfoWriter &) = delete; ~InfoWriter() = default; - template llvm::Error write(llvm::Expected &&Val) { -if (Val) - return getInfo(Size, Target, SizeRet, *Val); -return Val.takeError(); + template llvm::Error write(T Val) { +return getInfo(Size, Target, SizeRet, Val); } - template - llvm::Error writeArray(llvm::Expected &&Val, size_t Elems) { -if (Val) - return getInfoArray(Elems, Size, Target, SizeRet, *Val); -return Val.takeError(); + template llvm::Error writeArray(T Val, size_t Elems) { +return getInfoArray(Elems, Size, Target, SizeRet, Val); } - llvm::Error writeString(llvm::Expected &&Val) { -if (Val) - return getInfoString(Size, Target, SizeRet, *Val); -return Val.takeError(); + llvm::Error writeString(llvm::StringRef Val) { +return getInfoString(Size, Target, SizeRet, Val); } private: diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index f9da638436705..c84bf01460252 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -286,78 +286,63 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Plugin::error(ErrorCode::UNIMPLEMENTED, ErrBuffer.c_str()); }; - // Find the info if it exists under any of the given names - auto getInfoString = - [&](std::vector Names) -> llvm::Expected { -for (auto &Name : Names) { - if (auto Entry = Device->Info.get(Name)) { -if (!std::holds_alternative((*Entry)->Value)) - return makeError(ErrorCode::BACKEND_FAILURE, - "plugin returned incorrect type"); -return std::get((*Entry)->Value).c_str(); - } -} - -return makeError(ErrorCode::UNIMPLEMENTED, - "plugin did not provide a response for this information"); - }; - - auto getInfoXyz = - [&](std::vector Names) -> llvm::Expected { -for (auto &Name : Names) { - if (auto Entry = Device->Info.get(Name)) { -auto Node = *Entry; -ol_dimensions_t Out{0, 0, 0}; - -auto getField = [&](StringRef Name, uint32_t &Dest) { - if (auto F = Node->get(Name)) { -if (!std::holds_alternative((*F)->Value)) - return makeError( - ErrorCode::BACKEND_FAILURE, - "plugin returned incorrect type for dimensions element"); -Dest = std::get((*F)->Value); - } else -return makeError(ErrorCode::BACKEND_FAILURE, - "plugin didn't provide all values for dimensions"); - return Plugin::success(); -}; - -if (auto Res = getField("x", Out.x)) - return Res; -if (auto Res = getField("y", Out.y)) - return Res; -if (auto Res = getField("z", Out.z)) - return Res; - -return Out; - } -} + // These are not implemented by the plugin interface + if (PropName == OL_DEVICE_INFO_PLATFORM) +return Info.write(Device->Platform); + if (PropName == OL_DEVICE_INFO_TYPE) +return Info.write(OL_DEVICE_TYPE_GPU); + // TODO: Update when https://github.com/llvm/llvm-project/pull/147314 is merged + if (PropName > OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE) +return 
createOffloadError(ErrorCode::INVALID_ENUMERATION, + "getDeviceInfo enum '%i' is invalid", PropName); + auto EntryOpt = Device->Info.get(static_cast(PropName)); + if (!EntryOpt) return makeError(ErrorCode::UNIMPLEMENTED, "plugin did not provide a response for this information"); - }; + auto Entry = *EntryOpt; switch (PropName) { - case OL_DEVICE_INFO_PLATFORM: -return Info.write(Device->Platform); - case OL_DEVICE_INFO_TYPE: -return Info.write(OL_DEVICE_TYPE_GPU); case OL_DEVICE_INFO_NAME: -return Info.writeString(getInfoString({"Device Name"})); case OL_DEVICE_INFO_VENDOR: -return Info.writeString(getInfoString({"Vendor Name"})); - case OL_DEVICE_INFO_DRIVER_VERSION: -return Info.writeString( -getInfoString({"CUDA Driver Version", "HSA Runtime Version"})); - case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: -return Info.write(getInfoXyz({"Workgroup Max Size per Dim
[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)
@@ -171,6 +186,12 @@ struct InfoTreeNode { return It; } + std::optional<InfoTreeNode *> get(DeviceInfo Info) { +if (DeviceInfoMap.count(Info)) + return &(*Children)[DeviceInfoMap[Info]]; +return std::nullopt; arsenm wrote: This is still a double map lookup; do one find. https://github.com/llvm/llvm-project/pull/147317
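One possible shape of the single-lookup version being asked for; a sketch against the `get` overload quoted above, not the author's final code:

```cpp
std::optional<InfoTreeNode *> get(DeviceInfo Info) {
  // find() walks the map once; count() followed by operator[] walked it twice.
  auto It = DeviceInfoMap.find(Info);
  if (It == DeviceInfoMap.end())
    return std::nullopt;
  return &(*Children)[It->second];
}
```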
[llvm-branch-commits] [llvm] [Offload] Refactor device information queries to use new tagging (PR #147318)
github-actions[bot] wrote: :warning: C/C++ code formatter, clang-format found issues in your code. :warning: You can test this locally with the following command: ``bash git-clang-format --diff HEAD~1 HEAD --extensions hpp,cpp -- offload/liboffload/src/Helpers.hpp offload/liboffload/src/OffloadImpl.cpp `` View the diff from clang-format here. ``diff diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index c84bf0146..4ca32d2e0 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -291,7 +291,8 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.write(Device->Platform); if (PropName == OL_DEVICE_INFO_TYPE) return Info.write(OL_DEVICE_TYPE_GPU); - // TODO: Update when https://github.com/llvm/llvm-project/pull/147314 is merged + // TODO: Update when https://github.com/llvm/llvm-project/pull/147314 is + // merged if (PropName > OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE) return createOffloadError(ErrorCode::INVALID_ENUMERATION, "getDeviceInfo enum '%i' is invalid", PropName); `` https://github.com/llvm/llvm-project/pull/147318
[llvm-branch-commits] [clang] [LifetimeSafety] Add script performance benchmarking (PR #147315)
https://github.com/usx95 updated https://github.com/llvm/llvm-project/pull/147315 >From 0fbfd74d23b6cd26ef0480f7b9061b2f4a745338 Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Mon, 7 Jul 2025 15:13:00 + Subject: [PATCH 1/2] [LifetimeSafety] Add script performance benchmarking --- clang/lib/Analysis/LifetimeSafety.cpp | 7 +- .../Analysis/lifetime_safety/benchmark.py | 215 ++ 2 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 clang/test/Analysis/lifetime_safety/benchmark.py diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index e881e592ef59f..1c83b5051bad1 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -151,7 +151,12 @@ class OriginManager { OriginID get(const ValueDecl &D) { auto It = DeclToOriginID.find(&D); -assert(It != DeclToOriginID.end()); +// TODO: This should be an assert(It != ExprToOriginID.end()). The current +// implementation falls back to getOrCreate to avoid crashing on +// yet-unhandled pointer expressions, creating an empty origin for them. +if (It == DeclToOriginID.end()) + return getOrCreate(D); + return It->second; } diff --git a/clang/test/Analysis/lifetime_safety/benchmark.py b/clang/test/Analysis/lifetime_safety/benchmark.py new file mode 100644 index 0..ddf32e192de17 --- /dev/null +++ b/clang/test/Analysis/lifetime_safety/benchmark.py @@ -0,0 +1,215 @@ +import sys +import argparse +import subprocess +import tempfile +import json +import os +from datetime import datetime +import numpy as np +from scipy.optimize import curve_fit +from scipy.stats import t + +def generate_cpp_cycle_test(n: int) -> str: +""" +Generates a C++ code snippet with a specified number of pointers in a cycle. +""" +if n <= 0: +return "// Number of variables must be positive." + +cpp_code = "struct MyObj { int id; ~MyObj() {} };\n\n" +cpp_code += f"void long_cycle_{n}(bool condition) {{\n" +for i in range(1, n + 1): +cpp_code += f" MyObj v{i}{{1}};\n" +cpp_code += "\n" +for i in range(1, n + 1): +cpp_code += f" MyObj* p{i} = &v{i};\n" + +cpp_code += "\n while (condition) {\n" +if n > 0: +cpp_code += f"MyObj* temp = p1;\n" +for i in range(1, n): +cpp_code += f"p{i} = p{i+1};\n" +cpp_code += f"p{n} = temp;\n" +cpp_code += " }\n}\n" +cpp_code += f"\nint main() {{ long_cycle_{n}(false); return 0; }}\n" +return cpp_code + +def generate_cpp_merge_test(n: int) -> str: +""" +Generates a C++ code snippet with N independent conditional assignments. +""" +if n <= 0: +return "// Number of variables must be positive." + +cpp_code = "struct MyObj { int id; ~MyObj() {} };\n\n" +cpp_code += f"void conditional_merges_{n}(bool condition) {{\n" +decls = [f"v{i}" for i in range(1, n + 1)] +cpp_code += f" MyObj {', '.join(decls)};\n" +ptr_decls = [f"*p{i} = nullptr" for i in range(1, n + 1)] +cpp_code += f" MyObj {', '.join(ptr_decls)};\n\n" + +for i in range(1, n + 1): +cpp_code += f" if(condition) {{ p{i} = &v{i}; }}\n" + +cpp_code += "}\n" +cpp_code += f"\nint main() {{ conditional_merges_{n}(false); return 0; }}\n" +return cpp_code + +def analyze_trace_file(trace_path: str) -> tuple[float, float]: +""" +Parses the -ftime-trace JSON output to find durations. + +Returns: +A tuple of (lifetime_analysis_duration_us, total_clang_duration_us). 
+""" +lifetime_duration = 0.0 +total_duration = 0.0 +try: +with open(trace_path, 'r') as f: +trace_data = json.load(f) +for event in trace_data.get('traceEvents', []): +if event.get('name') == 'LifetimeAnalysis': +lifetime_duration += float(event.get('dur', 0)) +if event.get('name') == 'ExecuteCompiler': +total_duration += float(event.get('dur', 0)) + +except (IOError, json.JSONDecodeError) as e: +print(f"Error reading or parsing trace file {trace_path}: {e}", file=sys.stderr) +return 0.0, 0.0 +return lifetime_duration, total_duration + +def power_law(n, c, k): +"""Represents the power law function: y = c * n^k""" +return c * np.power(n, k) + +def human_readable_time(ms: float) -> str: +"""Converts milliseconds to a human-readable string (ms or s).""" +if ms >= 1000: +return f"{ms / 1000:.2f} s" +return f"{ms:.2f} ms" + +def generate_markdown_report(results: dict) -> str: +"""Generates a Markdown-formatted report from the benchmark results.""" +report = [] +timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S %Z") +report.append(f"# Lifetime Analysis Performance Report") +report.append(f"> Generated on: {timestamp}") +report.append("\n---\n") + +for test_typ
[llvm-branch-commits] [clang] [LifetimeSafety] Add script for performance benchmarking (PR #147315)
https://github.com/usx95 edited https://github.com/llvm/llvm-project/pull/147315
[llvm-branch-commits] [llvm] [Offload] Allow "tagging" device info entries with offload keys (PR #147317)
https://github.com/RossBrunton created https://github.com/llvm/llvm-project/pull/147317 When generating the device info tree, nodes can be marked with an offload Device Info value. The nodes can also look up children based on this value. >From 9b79557e7a536ccd4b02365c9dd98a4ef69f87e1 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Mon, 7 Jul 2025 16:10:19 +0100 Subject: [PATCH] [Offload] Allow "tagging" device info entries with offload keys When generating the device info tree, nodes can be marked with an offload Device Info value. The nodes can also look up children based on this value. --- offload/plugins-nextgen/amdgpu/src/rtl.cpp| 11 +--- .../common/include/PluginInterface.h | 27 --- offload/plugins-nextgen/cuda/src/rtl.cpp | 8 +++--- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 832c31c43b5d2..52ea3283b24ef 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2562,7 +2562,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) Info.add("HSA Runtime Version", - std::to_string(Major) + "." + std::to_string(Minor)); + std::to_string(Major) + "." + std::to_string(Minor), "", + DeviceInfo::DRIVER_VERSION); Info.add("HSA OpenMP Device Number", DeviceId); @@ -2572,11 +2573,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Device Name", TmpChar); + Info.add("Device Name", TmpChar, "", DeviceInfo::NAME); Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Vendor Name", TmpChar); + Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR); hsa_device_type_t DevType; Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); @@ -2652,7 +2653,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); if (Status == HSA_STATUS_SUCCESS) { - auto &MaxSize = *Info.add("Workgroup Max Size per Dimension"); + auto &MaxSize = + *Info.add("Workgroup Max Size per Dimension", std::monostate{}, "", +DeviceInfo::MAX_WORK_GROUP_SIZE); MaxSize.add("x", WorkgrpMaxDim[0]); MaxSize.add("y", WorkgrpMaxDim[1]); MaxSize.add("z", WorkgrpMaxDim[2]); diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index b5addc13d6644..9dc01ca0277fe 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -113,6 +113,12 @@ struct AsyncInfoWrapperTy { __tgt_async_info *AsyncInfoPtr; }; +enum class DeviceInfo { +#define OFFLOAD_DEVINFO(Name, _, Value) Name = Value, +#include "OffloadInfo.inc" +#undef OFFLOAD_DEVINFO +}; + /// Tree node for device information /// /// This information is either printed or used by liboffload to extract certain @@ -133,6 +139,8 @@ struct InfoTreeNode { // * The same key can appear multiple times std::unique_ptr> Children; + std::map DeviceInfoMap; + InfoTreeNode() : InfoTreeNode("", std::monostate{}, "") {} InfoTreeNode(std::string Key, VariantType Value, std::string Units) : Key(Key), Value(Value), Units(Units) {} @@ -140,10 +148,12 @@ struct InfoTreeNode { /// 
Add a new info entry as a child of this node. The entry requires at least /// a key string in \p Key. The value in \p Value is optional and can be any /// type that is representable as a string. The units in \p Units is optional - /// and must be a string. + /// and must be a string. Providing a device info key allows liboffload to + /// use that value for an appropriate olGetDeviceInfo query template InfoTreeNode *add(std::string Key, T Value = T(), -const std::string &Units = std::string()) { +const std::string &Units = std::string(), +std::optional DeviceInfoKey = std::nullopt) { assert(!Key.empty() && "Invalid info key"); if (!Children) @@ -157,7 +167,12 @@ struct InfoTreeNode { else ValueVariant = std::string{Value}; -return &Children->emplace_back(Key, ValueVariant, Units); +auto Ptr = &Children->emplace_back(Key, ValueVariant, Units); + +if (DeviceInfoKey) + DeviceInfoMap[*DeviceInfoKey] = Children->size() - 1; + +return Ptr; } std::optional get(StringRef Key) { @@ -171,6 +186,12 @@ struct InfoTreeNode {
[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)
@@ -784,8 +785,8 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() || AFI->hasCalculatedStackSizeSVE()) && "Expected SVE area to be calculated by this point"); - return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE() && - !AFI->hasStackHazardSlotIndex(); + return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeZPR() && + !AFI->getStackSizePPR() && !AFI->hasStackHazardSlotIndex(); sdesmalen-arm wrote: nit: ```suggestion return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->hasSVEStackSize() && !AFI->hasStackHazardSlotIndex(); ``` https://github.com/llvm/llvm-project/pull/142391
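The `hasSVEStackSize()` helper the suggestion leans on is not visible in this hunk; presumably it is (or would be) something like the following sketch, inferred from the getters in this PR rather than quoted from it:

```cpp
// Assumed shape: true if either SVE stack area is non-empty.
bool hasSVEStackSize() const {
  return getStackSizeZPR() > 0 || getStackSizePPR() > 0;
}
```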
[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)
@@ -299,14 +297,20 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { TailCallReservedStack = bytes; } - bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + void setStackSizeZPR(uint64_t S) { +HasCalculatedStackSizeSVE = true; sdesmalen-arm wrote: nit: this function sets `HasCalculatedStackSizeSVE` even if only one of the two values has been set. Is it worth making this `setStackSizeSVE(uint64_t ZPR, uint64_t PPR = 0)` such that `HasCalculatedStackSizeSVE` is set only once? https://github.com/llvm/llvm-project/pull/142391
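Spelling out the reviewer's suggestion as a sketch (one setter, so the flag is only ever written in one place):

```cpp
void setStackSizeSVE(uint64_t ZPR, uint64_t PPR = 0) {
  // A single entry point means HasCalculatedStackSizeSVE cannot end up
  // set while only half of the two sizes have been written.
  HasCalculatedStackSizeSVE = true;
  StackSizeZPR = ZPR;
  StackSizePPR = PPR;
}
```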
[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)
@@ -299,14 +297,20 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { TailCallReservedStack = bytes; } - bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + void setStackSizeZPR(uint64_t S) { +HasCalculatedStackSizeSVE = true; +StackSizeZPR = S; + } - void setStackSizeSVE(uint64_t S) { + void setStackSizePPR(uint64_t S) { HasCalculatedStackSizeSVE = true; -StackSizeSVE = S; +StackSizePPR = S; } - uint64_t getStackSizeSVE() const { return StackSizeSVE; } + uint64_t getStackSizeZPR() const { return StackSizeZPR; } sdesmalen-arm wrote: not related to your PR, but I think we should add an assert that `HasCalculatedStackSizeSVE` is true (same for CalleeSavedStackSize), although unfortunately that currently leads to some failures where they're used. https://github.com/llvm/llvm-project/pull/142391
[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)
@@ -451,10 +454,36 @@ static unsigned getFixedObjectSize(const MachineFunction &MF, } } -/// Returns the size of the entire SVE stackframe (calleesaves + spills). +static unsigned getStackHazardSize(const MachineFunction &MF) { sdesmalen-arm wrote: nit: maybe just move the implementation to where it is declared? https://github.com/llvm/llvm-project/pull/142391
[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)
@@ -19,6 +19,11 @@ namespace llvm { +struct SVEStackSizes { sdesmalen-arm wrote: Should this be named `SVEStackOffsets` (given that they're used as signed offsets)? https://github.com/llvm/llvm-project/pull/142391
[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)
@@ -644,7 +644,8 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { if (ST.hasSVE() || ST.isStreaming()) { // Frames that have variable sized objects and scalable SVE objects, // should always use a basepointer. - if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE()) + if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeZPR() || + AFI->getStackSizePPR()) sdesmalen-arm wrote: nit: ```suggestion if (!AFI->hasCalculatedStackSizeSVE() || AFI->hasSVEStackSize()) ``` https://github.com/llvm/llvm-project/pull/142391
[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)
@@ -1605,25 +1634,19 @@ static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget<AArch64Subtarget>().isTargetWindows(); } -static unsigned getStackHazardSize(const MachineFunction &MF) { - return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); -} - // Convenience function to determine whether I is an SVE callee save. -static bool IsSVECalleeSave(MachineBasicBlock::iterator I) { +static bool IsZPRCalleeSave(MachineBasicBlock::iterator I) { sdesmalen-arm wrote: nit: given that you're renaming these, what about calling them `isPartOfZPRCalleeSave` (because a `PTRUE_B` instruction is not a callee-save in itself) https://github.com/llvm/llvm-project/pull/142391
[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)
@@ -4294,24 +4396,32 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, report_fatal_error( "Alignment of scalable vectors > 16 bytes is not yet supported"); +int64_t &Offset = OffsetForObject(FI, ZPROffset, PPROffset); Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment); if (AssignOffsets) Assign(FI, -Offset); } - return Offset; + PPROffset = alignTo(PPROffset, Align(16U)); + ZPROffset = alignTo(ZPROffset, Align(16U)); + + if (&ZPROffset != &PPROffset) { +// SplitSVEObjects (PPRs and ZPRs allocated to separate areas). +return SVEStackSizes{ZPROffset, PPROffset}; + } + // When SplitSVEObjects is disabled just attribute all the stack to ZPRs. + // Determining the split is not necessary. + return SVEStackSizes{ZPROffset, 0}; sdesmalen-arm wrote: When you use an instance of the return type (`SVEStackSizes`) instead of `ZPRStack` and `PPRStack`, then you can just return that struct at the end of this function. https://github.com/llvm/llvm-project/pull/142391
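A sketch of the single-return shape being suggested. The field names of `SVEStackSizes` are assumed for illustration only, since the quoted hunks show just the aggregate initialization:

```cpp
struct SVEStackSizes {
  int64_t ZPRStackSize = 0; // assumed field name
  int64_t PPRStackSize = 0; // assumed field name
};

static SVEStackSizes
determineSVEStackObjectOffsets(MachineFunction &MF, bool AssignOffsets,
                               bool SplitSVEObjects = false) {
  SVEStackSizes Offsets{};
  // Alias the two working offsets onto the struct's fields; when the areas
  // are not split, both references write the ZPR field.
  int64_t &ZPROffset = Offsets.ZPRStackSize;
  int64_t &PPROffset =
      SplitSVEObjects ? Offsets.PPRStackSize : Offsets.ZPRStackSize;
  // ... offset assignment writes through the two references ...
  return Offsets; // one return, no branching on SplitSVEObjects at the end
}
```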
[llvm-branch-commits] [llvm] [AArch64] Prepare for split ZPR and PPR area allocation (NFCI) (PR #142391)
@@ -4227,10 +4310,20 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, // Fills in the first and last callee-saved frame indices into // Min/MaxCSFrameIndex, respectively. // Returns the size of the stack. -static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, - int &MinCSFrameIndex, - int &MaxCSFrameIndex, - bool AssignOffsets) { +static SVEStackSizes +determineSVEStackObjectOffsets(MachineFunction &MF, bool AssignOffsets, + bool SplitSVEObjects = false) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + int64_t ZPRStack = 0; + int64_t PPRStack = 0; + + auto [ZPROffset, PPROffset] = [&] { +if (SplitSVEObjects) + return std::tie(ZPRStack, PPRStack); +return std::tie(ZPRStack, ZPRStack); + }(); sdesmalen-arm wrote: This seems a lot more readable: ```suggestion int64_t &ZPROffset = ZPRStack; int64_t &PPROffset = SplitSVEObjects ? PPRStack : ZPRStack; ``` Also, can you add a brief comment describing why you create two aliases? https://github.com/llvm/llvm-project/pull/142391
[llvm-branch-commits] [clang] [HLSL][RootSignature] Implement diagnostic for missed comma (PR #147350)
inbelic wrote: Contemplating whether I should split this into two PRs. I will see if there is a nice way to de-couple the improvement and the error-fix portions of this. https://github.com/llvm/llvm-project/pull/147350
[llvm-branch-commits] [clang] [HLSL][RootSignature] Implement diagnostic for missed comma (PR #147350)
https://github.com/inbelic converted_to_draft https://github.com/llvm/llvm-project/pull/147350
[llvm-branch-commits] [llvm] [LV] Bundle sub reductions into VPExpressionRecipe (PR #147255)
https://github.com/SamTebbs33 created https://github.com/llvm/llvm-project/pull/147255 This PR bundles sub reductions into the VPExpressionRecipe class and adjusts the cost functions to take the negation into account. >From 1a5f4e42e4f9d1eae0222302dcabdf08492f67c3 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Mon, 30 Jun 2025 14:29:54 +0100 Subject: [PATCH] [LV] Bundle sub reductions into VPExpressionRecipe This PR bundles sub reductions into the VPExpressionRecipe class and adjusts the cost functions to take the negation into account. --- .../llvm/Analysis/TargetTransformInfo.h | 4 +- .../llvm/Analysis/TargetTransformInfoImpl.h | 2 +- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 3 + llvm/lib/Analysis/TargetTransformInfo.cpp | 5 +- .../AArch64/AArch64TargetTransformInfo.cpp| 7 +- .../AArch64/AArch64TargetTransformInfo.h | 2 +- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 7 +- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 1 + .../Transforms/Vectorize/LoopVectorize.cpp| 6 +- llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 35 - .../Transforms/Vectorize/VPlanTransforms.cpp | 33 ++-- .../Transforms/Vectorize/VectorCombine.cpp| 4 +- .../vplan-printing-reductions.ll | 143 ++ 14 files changed, 236 insertions(+), 27 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index c43870392361d..3cc0ea01953c3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1645,8 +1645,10 @@ class TargetTransformInfo { /// extensions. This is the cost of as: /// ResTy vecreduce.add(mul (A, B)). /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B)). + /// The multiply can optionally be negated, which signifies that it is a sub + /// reduction. LLVM_ABI InstructionCost getMulAccReductionCost( - bool IsUnsigned, Type *ResTy, VectorType *Ty, + bool IsUnsigned, Type *ResTy, VectorType *Ty, bool Negated, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; /// Calculate the cost of an extended reduction pattern, similar to diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 12f87226c5f57..fd22981a5dbf3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -960,7 +960,7 @@ class TargetTransformInfoImplBase { virtual InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, - TTI::TargetCostKind CostKind) const { + bool Negated, TTI::TargetCostKind CostKind) const { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index bf958e100f2ac..a9c9fa6d1db0d 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -3116,7 +3116,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, + bool Negated, TTI::TargetCostKind CostKind) const override { +if (Negated) + return InstructionCost::getInvalid(CostKind); // Without any native support, this is equivalent to the cost of // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or // vecreduce.add(mul(A, B)). 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3ebd9d487ba04..ba0d070bffe6d 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1274,9 +1274,10 @@ InstructionCost TargetTransformInfo::getExtendedReductionCost( } InstructionCost TargetTransformInfo::getMulAccReductionCost( -bool IsUnsigned, Type *ResTy, VectorType *Ty, +bool IsUnsigned, Type *ResTy, VectorType *Ty, bool Negated, TTI::TargetCostKind CostKind) const { - return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, CostKind); + return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, Negated, + CostKind); } InstructionCost diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 380faa6cf6939..d9a367535baf4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5316,8 +5316,10 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost( InstructionCost AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, - VectorType *VecTy, + VectorType *VecTy, bo
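For readers unfamiliar with the pattern: a "sub reduction" here is a reduction whose accumulator is updated with the negation of a (possibly extended) multiply, matching the `vecreduce.add` shapes named in the cost-model comments above. The C++ below is only an illustrative source form of such a loop, not code from the patch:

```cpp
#include <cstdint>

// acc -= ext(a[i]) * ext(b[i]): an extend-multiply feeding the reduction
// through a subtract, which the cost model treats as a negated mul-acc.
int64_t dot_sub(const int16_t *A, const int16_t *B, int N) {
  int64_t Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum -= int64_t(A[I]) * int64_t(B[I]);
  return Sum;
}
```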
[llvm-branch-commits] [llvm] [LV] Bundle sub reductions into VPExpressionRecipe (PR #147255)
llvmbot wrote: @llvm/pr-subscribers-backend-arm Author: Sam Tebbs (SamTebbs33) Changes This PR bundles sub reductions into the VPExpressionRecipe class and adjusts the cost functions to take the negation into account. --- Patch is 23.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147255.diff 14 Files Affected: - (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+3-1) - (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+1-1) - (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+3) - (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+3-2) - (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+5-2) - (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (+1-1) - (modified) llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp (+5-2) - (modified) llvm/lib/Target/ARM/ARMTargetTransformInfo.h (+1) - (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+3-3) - (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+11) - (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+32-3) - (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+23-10) - (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+2-2) - (modified) llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll (+143) ``diff diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index c43870392361d..3cc0ea01953c3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1645,8 +1645,10 @@ class TargetTransformInfo { /// extensions. This is the cost of as: /// ResTy vecreduce.add(mul (A, B)). /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B)). + /// The multiply can optionally be negated, which signifies that it is a sub + /// reduction. LLVM_ABI InstructionCost getMulAccReductionCost( - bool IsUnsigned, Type *ResTy, VectorType *Ty, + bool IsUnsigned, Type *ResTy, VectorType *Ty, bool Negated, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; /// Calculate the cost of an extended reduction pattern, similar to diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 12f87226c5f57..fd22981a5dbf3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -960,7 +960,7 @@ class TargetTransformInfoImplBase { virtual InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, - TTI::TargetCostKind CostKind) const { + bool Negated, TTI::TargetCostKind CostKind) const { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index bf958e100f2ac..a9c9fa6d1db0d 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -3116,7 +3116,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, + bool Negated, TTI::TargetCostKind CostKind) const override { +if (Negated) + return InstructionCost::getInvalid(CostKind); // Without any native support, this is equivalent to the cost of // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or // vecreduce.add(mul(A, B)). 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3ebd9d487ba04..ba0d070bffe6d 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1274,9 +1274,10 @@ InstructionCost TargetTransformInfo::getExtendedReductionCost( } InstructionCost TargetTransformInfo::getMulAccReductionCost( -bool IsUnsigned, Type *ResTy, VectorType *Ty, +bool IsUnsigned, Type *ResTy, VectorType *Ty, bool Negated, TTI::TargetCostKind CostKind) const { - return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, CostKind); + return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, Negated, + CostKind); } InstructionCost diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 380faa6cf6939..d9a367535baf4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5316,8 +5316,10 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost( InstructionCost AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, - VectorType *VecTy, + VectorType *VecTy, bool Negated,