[llvm-branch-commits] [llvm] [AMDGPU] Add SubtargetFeature for dynamic VGPR mode (PR #130030)
@@ -1239,6 +1239,12 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts", "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" >; +def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", shiltian wrote: Where is this target feature enabled? https://github.com/llvm/llvm-project/pull/130030 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Remove subreg def check for insert_subreg (PR #130085)
arsenm wrote: ### Merge activity * **Mar 6, 7:26 PM EST**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/130085). https://github.com/llvm/llvm-project/pull/130085 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Remove subreg def check for bitcast (PR #130086)
arsenm wrote: ### Merge activity * **Mar 6, 7:26 PM EST**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/130086). https://github.com/llvm/llvm-project/pull/130086 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode (PR #130037)
https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/130037 >From c29d8202c06488a9466aea49dda4cf2b4663236e Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Mon, 23 Oct 2023 11:46:19 +0200 Subject: [PATCH 1/2] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode In dynamic VGPR mode, Waves must deallocate all VGPRs before exiting. If the shader program does not do this, hardware inserts `S_ALLOC_VGPR 0` before S_ENDPGM, but this may incur some performance cost. Therefore it's better if the compiler proactively generates that instruction. This patch extends `si-insert-waitcnts` to deallocate the VGPRs via a `S_ALLOC_VGPR 0` before any `S_ENDPGM` when in dynamic VGPR mode. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 60 +-- .../CodeGen/AMDGPU/release-vgprs-gfx12.mir| 356 ++ 2 files changed, 393 insertions(+), 23 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7e6bce2bf5f12..42ef23e836a58 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } - // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM - // stores. In this case it can be useful to send a message to explicitly - // release all VGPRs before the stores have completed, but it is only safe to - // do this if: - // * there are no outstanding scratch stores - // * we are not in Dynamic VGPR mode + // In dynamic VGPR mode, we want to release the VGPRs before the wave exits. + // Technically the hardware will do this on its own if we don't, but that + // might cost extra cycles compared to doing it explicitly. 
+ // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may + // have to wait for outstanding VMEM stores. In this case it can be useful to + // send a message to explicitly release all VGPRs before the stores have + // completed, but it is only safe to do this if there are no outstanding + // scratch stores. else if (MI.getOpcode() == AMDGPU::S_ENDPGM || MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { -if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() && -ScoreBrackets.getScoreRange(STORE_CNT) != 0 && -!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) +if (!WCG->isOptNone() && +(ST->isDynamicVGPREnabled() || + (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && + ScoreBrackets.getScoreRange(STORE_CNT) != 0 && + !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS ReleaseVGPRInsts.insert(&MI); } // Resolve vm waits before gs-done. @@ -2610,26 +2614,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } - // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM - // instructions. + // Deallocate the VGPRs before previously identified S_ENDPGM instructions. + // This is done in different ways depending on how the VGPRs were allocated + // (i.e. whether we're in dynamic VGPR mode or not). // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short // waveslot limited kernel runs slower with the deallocation. 
- if (!ReleaseVGPRInsts.empty() && - (MF.getFrameInfo().hasCalls() || - ST->getOccupancyWithNumVGPRs( - TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) < - AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { + if (ST->isDynamicVGPREnabled()) { for (MachineInstr *MI : ReleaseVGPRInsts) { - if (ST->requiresNopBeforeDeallocVGPRs()) { -BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), -TII->get(AMDGPU::S_NOP)) -.addImm(0); - } BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_SENDMSG)) - .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); + TII->get(AMDGPU::S_ALLOC_VGPR)) + .addImm(0); Modified = true; } + } else { +if (!ReleaseVGPRInsts.empty() && +(MF.getFrameInfo().hasCalls() || + ST->getOccupancyWithNumVGPRs( + TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) < + AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { + for (MachineInstr *MI : ReleaseVGPRInsts) { +if (ST->requiresNopBeforeDeallocVGPRs()) { + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_NOP)) + .addImm(0); +} +BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), +TII->get(AMDGPU::S_SENDMSG)) +.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GF
[llvm-branch-commits] [flang] [flang][OpenMP] Accept old FLUSH syntax in METADIRECTIVE (PR #130122)
https://github.com/kparzysz updated https://github.com/llvm/llvm-project/pull/130122 >From bf56b8c80a0f1a7e06dcd3e898172c27e5afabf5 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 5 Mar 2025 08:24:30 -0600 Subject: [PATCH 1/3] [flang][OpenMP] Accept old FLUSH syntax in METADIRECTIVE Accommodate it in OmpDirectiveSpecification, which may become the primary component of the actual FLUSH construct in the future. --- flang/include/flang/Parser/dump-parse-tree.h | 1 + flang/include/flang/Parser/parse-tree.h | 6 ++- flang/lib/Parser/openmp-parsers.cpp | 32 +-- flang/lib/Parser/unparse.cpp | 28 +++--- .../Parser/OpenMP/metadirective-flush.f90 | 54 +++ 5 files changed, 109 insertions(+), 12 deletions(-) create mode 100644 flang/test/Parser/OpenMP/metadirective-flush.f90 diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index a154794e41e9d..fcd902d25fa40 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -491,6 +491,7 @@ class ParseTreeDumper { NODE(OmpWhenClause, Modifier) NODE(parser, OmpDirectiveName) NODE(parser, OmpDirectiveSpecification) + NODE_ENUM(OmpDirectiveSpecification, Flags) NODE(parser, OmpTraitPropertyName) NODE(parser, OmpTraitScore) NODE(parser, OmpTraitPropertyExtension) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 346299b8e5215..a197249ebae91 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4503,13 +4503,15 @@ struct OmpClauseList { // --- Directives and constructs struct OmpDirectiveSpecification { - CharBlock source; + ENUM_CLASS(Flags, None, DeprecatedSyntax); TUPLE_CLASS_BOILERPLATE(OmpDirectiveSpecification); llvm::omp::Directive DirId() const { // return std::get(t).v; } + + CharBlock source; std::tuple>, - std::optional> + std::optional, Flags> t; }; diff --git a/flang/lib/Parser/openmp-parsers.cpp 
b/flang/lib/Parser/openmp-parsers.cpp index b3e76d70c8064..0de7690b90262 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -995,10 +995,34 @@ TYPE_PARSER(sourced(construct( // --- Parsers for directives and constructs -- -TYPE_PARSER(sourced(construct( // -sourced(OmpDirectiveNameParser{}), -maybe(parenthesized(nonemptyList(Parser{}))), -maybe(Parser{} +OmpDirectiveSpecification static makeFlushFromOldSyntax1( +Verbatim &&text, std::optional &&clauses, +std::optional> &&args, +OmpDirectiveSpecification::Flags &&flags) { + return OmpDirectiveSpecification{OmpDirectiveName(text), std::move(args), + std::move(clauses), std::move(flags)}; +} + +TYPE_PARSER(sourced( +// Parse the old syntax: FLUSH [clauses] [(objects)] +construct( // +// Force this old-syntax parser to fail for FLUSH followed by '('. +// Otherwise it could succeed on the new syntax but have one of +// lists absent in the parsed result. +// E.g. for FLUSH(x) SEQ_CST it would find no clauses following +// the directive name, parse the argument list "(x)" and stop. 
+applyFunction(makeFlushFromOldSyntax1, +verbatim("FLUSH"_tok) / !lookAhead("("_tok), +maybe(Parser{}), +maybe(parenthesized(nonemptyList(Parser{}))), +pure(OmpDirectiveSpecification::Flags::DeprecatedSyntax))) || +// Parse the standard syntax: directive [(arguments)] [clauses] +construct( // +sourced(OmpDirectiveNameParser{}), +maybe(parenthesized(nonemptyList(Parser{}))), +maybe(Parser{}), +pure(OmpDirectiveSpecification::Flags::None)) +)) TYPE_PARSER(sourced(construct("NOTHING" >> ok))) diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 4f5c05dc2aa25..262077e62441b 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2094,14 +2094,30 @@ class UnparseVisitor { Word(llvm::omp::getOpenMPDirectiveName(x).str()); } void Unparse(const OmpDirectiveSpecification &x) { -using ArgList = std::list; +auto unparseArgs{[&]() { + using ArgList = std::list; + if (auto &args{std::get>(x.t)}) { +Put("("); +Walk(*args); +Put(")"); + } +}}; +auto unparseClauses{[&]() { + Walk(std::get>(x.t)); +}}; + Walk(std::get(x.t)); -if (auto &args{std::get>(x.t)}) { - Put("("); - Walk(*args); - Put(")"); +auto flags{std::get(x.t)}; +if (flags == OmpDirectiveSpecification::Flags::DeprecatedSyntax) { + if (x.DirId() == llvm::omp::Directive::OMPD_flush) { +// FLUSH clause arglist +unparseClauses(); +unparseArgs(); + } +} else { + unparseArgs();
[llvm-branch-commits] [llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
@@ -0,0 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefix=CHECK %s + +; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack. + +define amdgpu_cs void @amdgpu_cs() #0 { +; CHECK-LABEL: amdgpu_cs: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + ret void +} + +define amdgpu_kernel void @kernel() #0 { +; CHECK-LABEL: kernel: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + ret void +} + +define amdgpu_cs void @with_local() #0 { +; CHECK-LABEL: with_local: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:v_mov_b32_e32 v0, 13 +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:scratch_store_b8 off, v0, s33 scope:SCOPE_SYS +; CHECK-NEXT:s_wait_storecnt 0x0 +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + %local = alloca i32, addrspace(5) + store volatile i8 13, ptr addrspace(5) %local + ret void +} + +; Check that we generate s_cselect for SP if we can fit +; the offset in an inline constant. 
+define amdgpu_cs void @with_calls_inline_const() #0 { +; CHECK-LABEL: with_calls_inline_const: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:v_mov_b32_e32 v0, 15 +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_mov_b32 s1, callee@abs32@hi +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_mov_b32 s0, callee@abs32@lo +; CHECK-NEXT:scratch_store_b8 off, v0, s33 scope:SCOPE_SYS +; CHECK-NEXT:s_wait_storecnt 0x0 +; CHECK-NEXT:v_mov_b32_e32 v0, 0x47 +; CHECK-NEXT:s_cselect_b32 s32, 0x1d0, 16 +; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1] +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + %local = alloca i32, addrspace(5) + store volatile i8 15, ptr addrspace(5) %local + call amdgpu_gfx void @callee(i32 71) + ret void +} + +; Check that we generate s_mov + s_cmovk if we can't +; fit the offset for SP in an inline constant. +define amdgpu_cs void @with_calls_no_inline_const() #0 { +; CHECK-LABEL: with_calls_no_inline_const: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:v_mov_b32_e32 v0, 15 +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_mov_b32 s1, callee@abs32@hi +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_mov_b32 s0, callee@abs32@lo +; CHECK-NEXT:scratch_store_b8 off, v0, s33 scope:SCOPE_SYS +; CHECK-NEXT:s_wait_storecnt 0x0 +; CHECK-NEXT:v_mov_b32_e32 v0, 0x47 +; CHECK-NEXT:s_movk_i32 s32, 0x100 +; CHECK-NEXT:s_cmovk_i32 s32, 0x2c0 +; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1] +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + %local = alloca i32, i32 61, addrspace(5) + store volatile i8 15, ptr addrspace(5) %local + call amdgpu_gfx void @callee(i32 71) + ret void +} + +; We're going to limit this to 16 VGPRs, so we need to spill the rest. 
+define amdgpu_cs void @with_spills(ptr addrspace(1) %p1, ptr addrspace(1) %p2) #1 { +; CHECK-LABEL: with_spills: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off offset:96 +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:80 ; 16-byte Folded Spill +; CHECK-NEXT:s_clause 0x2 +; CHECK-NEXT:global_load_b128 v[8:11], v[0:1], off offset:112 +; CHECK-NEXT:global_load_b128 v[12:15], v[0:1], off offset:64 +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off offset:80 +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:64 ; 16-byte Folded Spill +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off offset:32 +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:48 ; 16-byte Folded Spill +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off offset:48 +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:32 ; 16-byte Folded Spill +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:16 ; 16-byte Folded Spill
[llvm-branch-commits] [llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
@@ -0,0 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefix=CHECK %s + +; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack. + +define amdgpu_cs void @amdgpu_cs() #0 { +; CHECK-LABEL: amdgpu_cs: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + ret void +} + +define amdgpu_kernel void @kernel() #0 { +; CHECK-LABEL: kernel: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + ret void +} + +define amdgpu_cs void @with_local() #0 { +; CHECK-LABEL: with_local: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:v_mov_b32_e32 v0, 13 +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:scratch_store_b8 off, v0, s33 scope:SCOPE_SYS +; CHECK-NEXT:s_wait_storecnt 0x0 +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + %local = alloca i32, addrspace(5) + store volatile i8 13, ptr addrspace(5) %local + ret void +} + +; Check that we generate s_cselect for SP if we can fit +; the offset in an inline constant. 
+define amdgpu_cs void @with_calls_inline_const() #0 { +; CHECK-LABEL: with_calls_inline_const: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:v_mov_b32_e32 v0, 15 +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_mov_b32 s1, callee@abs32@hi +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_mov_b32 s0, callee@abs32@lo +; CHECK-NEXT:scratch_store_b8 off, v0, s33 scope:SCOPE_SYS +; CHECK-NEXT:s_wait_storecnt 0x0 +; CHECK-NEXT:v_mov_b32_e32 v0, 0x47 +; CHECK-NEXT:s_cselect_b32 s32, 0x1d0, 16 +; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1] +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + %local = alloca i32, addrspace(5) + store volatile i8 15, ptr addrspace(5) %local + call amdgpu_gfx void @callee(i32 71) + ret void +} + +; Check that we generate s_mov + s_cmovk if we can't +; fit the offset for SP in an inline constant. +define amdgpu_cs void @with_calls_no_inline_const() #0 { +; CHECK-LABEL: with_calls_no_inline_const: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:v_mov_b32_e32 v0, 15 +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_mov_b32 s1, callee@abs32@hi +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_mov_b32 s0, callee@abs32@lo +; CHECK-NEXT:scratch_store_b8 off, v0, s33 scope:SCOPE_SYS +; CHECK-NEXT:s_wait_storecnt 0x0 +; CHECK-NEXT:v_mov_b32_e32 v0, 0x47 +; CHECK-NEXT:s_movk_i32 s32, 0x100 +; CHECK-NEXT:s_cmovk_i32 s32, 0x2c0 +; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1] +; CHECK-NEXT:s_alloc_vgpr 0 +; CHECK-NEXT:s_endpgm + %local = alloca i32, i32 61, addrspace(5) + store volatile i8 15, ptr addrspace(5) %local + call amdgpu_gfx void @callee(i32 71) + ret void +} + +; We're going to limit this to 16 VGPRs, so we need to spill the rest. 
+define amdgpu_cs void @with_spills(ptr addrspace(1) %p1, ptr addrspace(1) %p2) #1 { +; CHECK-LABEL: with_spills: +; CHECK: ; %bb.0: +; CHECK-NEXT:s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1) +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off offset:96 +; CHECK-NEXT:s_cmp_lg_u32 0, s33 +; CHECK-NEXT:s_cmovk_i32 s33, 0x1c0 +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:80 ; 16-byte Folded Spill +; CHECK-NEXT:s_clause 0x2 +; CHECK-NEXT:global_load_b128 v[8:11], v[0:1], off offset:112 +; CHECK-NEXT:global_load_b128 v[12:15], v[0:1], off offset:64 +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off offset:80 +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:64 ; 16-byte Folded Spill +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off offset:32 +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:48 ; 16-byte Folded Spill +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off offset:48 +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:32 ; 16-byte Folded Spill +; CHECK-NEXT:global_load_b128 v[4:7], v[0:1], off +; CHECK-NEXT:s_wait_loadcnt 0x0 +; CHECK-NEXT:scratch_store_b128 off, v[4:7], s33 offset:16 ; 16-byte Folded Spill
[llvm-branch-commits] [llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
@@ -455,6 +455,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, unsigned NumSpilledSGPRs = 0; unsigned NumSpilledVGPRs = 0; + // The size of the scratch space reserved for the CWSR trap handler to spill + // some of the dynamic VGPRs. + unsigned ScratchReservedForDynamicVGPRs = 0; arsenm wrote: This field is missing MIR serialization and the associated tests. https://github.com/llvm/llvm-project/pull/130055 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [mlir] [OpenMPIRBuilder] Introduce OMPRegionInfo managing the stack of OpenMP region constructs. (PR #130135)
https://github.com/Meinersbur edited https://github.com/llvm/llvm-project/pull/130135 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [HEXAGON] Fix hvx-isel for extract_subvector op (#129672) (PR #130215)
https://github.com/iajbar approved this pull request. https://github.com/llvm/llvm-project/pull/130215 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [HEXAGON] Fix hvx-isel for extract_subvector op (#129672) (PR #130215)
llvmbot wrote: @iajbar What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/130215 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [HEXAGON] Fix hvx-isel for extract_subvector op (#129672) (PR #130215)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/130215 Backport 29d3fc3f11d272a72ac255af9277c740f26c3dfc Requested by: @androm3da >From 3c9189006713fbabe08a02e1d8fee0d79d7647a2 Mon Sep 17 00:00:00 2001 From: aankit-ca Date: Thu, 6 Mar 2025 15:02:10 -0800 Subject: [PATCH] [HEXAGON] Fix hvx-isel for extract_subvector op (#129672) Fixes a crash with extract_subvectors in Hexagon backend seen when the source vector is a vector-pair and result vector is not hvx vector size. LLVM Issue: https://github.com/llvm/llvm-project/issues/128775 Fixes #128775 - Co-authored-by: aankit-quic (cherry picked from commit 29d3fc3f11d272a72ac255af9277c740f26c3dfc) --- .../Target/Hexagon/HexagonISelLoweringHVX.cpp | 10 +- .../test/CodeGen/Hexagon/autohvx/fp-to-int.ll | 406 +- .../test/CodeGen/Hexagon/autohvx/int-to-fp.ll | 120 +++--- .../CodeGen/Hexagon/isel/extract-subvec.ll| 34 ++ 4 files changed, 302 insertions(+), 268 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/isel/extract-subvec.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 816e063f8dbbe..1a19e81a68f08 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1265,11 +1265,15 @@ HexagonTargetLowering::extractHvxSubvectorReg(SDValue OrigOp, SDValue VecV, // the subvector of interest. The subvector will never overlap two single // vectors. 
if (isHvxPairTy(VecTy)) { -if (Idx * ElemWidth >= 8*HwLen) +unsigned SubIdx = Hexagon::vsub_lo; +if (Idx * ElemWidth >= 8 * HwLen) { + SubIdx = Hexagon::vsub_hi; Idx -= VecTy.getVectorNumElements() / 2; +} -VecV = OrigOp; -if (typeSplit(VecTy).first == ResTy) +VecTy = typeSplit(VecTy).first; +VecV = DAG.getTargetExtractSubreg(SubIdx, dl, VecTy, VecV); +if (VecTy == ResTy) return VecV; } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll index ac51662242de8..196b37678be61 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll @@ -13,13 +13,13 @@ define void @f16s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT:{ ; CHECK-NEXT: r3:2 = combine(##32768,#1) ; CHECK-NEXT: r4 = #14 -; CHECK-NEXT: v1 = vmem(r0+#0) +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT:} ; CHECK-NEXT:{ ; CHECK-NEXT: v2.h = vsplat(r3) ; CHECK-NEXT: r6 = #5 ; CHECK-NEXT: v3.h = vasl(v0.h,r2) -; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: v1 = vmem(r0+#1) ; CHECK-NEXT:} ; CHECK-NEXT:{ ; CHECK-NEXT: v4.h = vsplat(r4) @@ -33,55 +33,55 @@ define void @f16s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT:} ; CHECK-NEXT:{ ; CHECK-NEXT: r3 = #16 -; CHECK-NEXT: v5.h = vasl(v1.h,r6) -; CHECK-NEXT: q1 = vcmp.gt(v7.h,v0.h) +; CHECK-NEXT: v5.h = vasl(v0.h,r6) +; CHECK-NEXT: q1 = vcmp.gt(v7.h,v1.h) ; CHECK-NEXT:} ; CHECK-NEXT:{ ; CHECK-NEXT: v6.h = vsplat(r3) -; CHECK-NEXT: v27.h = vasr(v3.h,r5) +; CHECK-NEXT: v28.h = vasr(v3.h,r5) ; CHECK-NEXT: v5 = vor(v5,v2) -; CHECK-NEXT: q0 = vcmp.gt(v7.h,v1.h) +; CHECK-NEXT: q0 = vcmp.gt(v7.h,v0.h) ; CHECK-NEXT:} ; CHECK-NEXT:{ ; CHECK-NEXT: v9.h = vsplat(r4) ; CHECK-NEXT: v8.h = vasr(v8.h,r5) ; CHECK-NEXT:} ; CHECK-NEXT:{ -; CHECK-NEXT: v26.h = vasl(v0.h,r6) -; CHECK-NEXT: v0.h = vsub(v4.h,v27.h) +; CHECK-NEXT: v27.h = vasl(v1.h,r6) +; CHECK-NEXT: v1.h = vsub(v4.h,v28.h) ; CHECK-NEXT: v4.h = vsub(v4.h,v8.h) -; CHECK-NEXT: v28 = vmux(q0,v2,v9) +; CHECK-NEXT: v29 = 
vmux(q0,v2,v9) ; CHECK-NEXT:} ; CHECK-NEXT:{ +; CHECK-NEXT: v1.h = vmin(v1.h,v6.h) +; CHECK-NEXT: v0 = vor(v27,v2) ; CHECK-NEXT: v4.h = vmin(v4.h,v6.h) -; CHECK-NEXT: v1 = vor(v26,v2) -; CHECK-NEXT: v0.h = vmin(v0.h,v6.h) ; CHECK-NEXT: v2 = vmux(q1,v2,v9) ; CHECK-NEXT:} ; CHECK-NEXT:{ -; CHECK-NEXT: q2 = vcmp.gt(v4.h,v7.h) -; CHECK-NEXT: q3 = vcmp.gt(v0.h,v7.h) +; CHECK-NEXT: q2 = vcmp.gt(v1.h,v7.h) +; CHECK-NEXT: q3 = vcmp.gt(v4.h,v7.h) ; CHECK-NEXT:} ; CHECK-NEXT:{ -; CHECK-NEXT: v5.h = vlsr(v5.h,v4.h) +; CHECK-NEXT: v5.h = vlsr(v5.h,v1.h) ; CHECK-NEXT:} ; CHECK-NEXT:{ -; CHECK-NEXT: v1.h = vlsr(v1.h,v0.h) -; CHECK-NEXT: v29.h = vsub(v7.h,v5.h) +; CHECK-NEXT: v0.h = vlsr(v0.h,v4.h) +; CHECK-NEXT: v30.h = vsub(v7.h,v5.h) ; CHECK-NEXT:} ; CHECK-NEXT:{ -; CHECK-NEXT: v30.h = vsub(v7.h,v1.h) -; CHECK-NEXT: v5 = vmux(q0,v29,v5) +; CHECK-NEXT: v31.h = vsub(v7.h,v0.h) +; CHECK-NEXT: v5 = vmux(q0,v30,v5) ; CHECK-NEXT:} ; CHECK-NEXT:{ -; CHECK-NEXT: v1 = vmux(q1,v30,v1) -; CHECK-NEXT: v31 = vmux(q2,v5,v28) +; CHECK-NEXT: v0 = vmux(q1,v31,v0) +; CHECK
[llvm-branch-commits] [llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
@@ -552,6 +552,7 @@ enum Id { // HwRegCode, (6) [5:0] enum Offset : unsigned { // Offset, (5) [10:6] OFFSET_MEM_VIOL = 8, + OFFSET_ME_ID = 8, perlfu wrote: It's slightly confusing that this enumeration of offsets applies to multiple registers. Perhaps comment which register this is for? e.g. `OFFSET_ME_ID = 8, // in HW_ID2` https://github.com/llvm/llvm-project/pull/130055 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
@@ -691,17 +691,61 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); - if (hasFP(MF)) { + unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST); + if (!mayReserveScratchForCWSR(MF)) { +if (hasFP(MF)) { + Register FPReg = MFI->getFrameOffsetReg(); + assert(FPReg != AMDGPU::FP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); +} + +if (requiresStackPointerReference(MF)) { + Register SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset); +} + } else { +// We need to check if we're on a compute queue - if we are, then the CWSR +// trap handler may need to store some VGPRs on the stack. The first VGPR +// block is saved separately, so we only need to allocate space for any +// additional VGPR blocks used. For now, we will make sure there's enough +// room for the theoretical maximum number of VGPRs that can be allocated. +// FIXME: Figure out if the shader uses fewer VGPRs in practice. +assert(hasFP(MF)); Register FPReg = MFI->getFrameOffsetReg(); assert(FPReg != AMDGPU::FP_REG); -BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); - } - - if (requiresStackPointerReference(MF)) { Register SPReg = MFI->getStackPtrOffsetReg(); assert(SPReg != AMDGPU::SP_REG); -BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) -.addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST)); +unsigned VGPRSize = +llvm::alignTo((ST.getAddressableNumVGPRs() - + AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) * + 4, + FrameInfo.getMaxAlign()); +MFI->setScratchReservedForDynamicVGPRs(VGPRSize); + +BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg) +.addImm(AMDGPU::Hwreg::HwregEncoding::encode( +AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 1)); perlfu wrote: Do you not need to retrieve 2 bits? i.e. 
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, **2**) https://github.com/llvm/llvm-project/pull/130055 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libclc] release/20.x: [libclc] Stop installing CLC headers (#126908) (PR #130017)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/130017 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libclc] release/20.x: [libclc] Stop installing CLC headers (#126908) (PR #130017)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/130017 Backport a2b05761724e5243056988d9d6bf1a5a94715b74 Requested by: @frasercrmck >From 11b893892cd56d43420bde2d4de9038479cf Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 6 Mar 2025 08:52:23 + Subject: [PATCH] [libclc] Stop installing CLC headers (#126908) The libclc headers are an implementation detail and are not intended to be used by others as OpenCL headers. The only artifacts of libclc we want to publish are the LLVM bytecode libraries. As the headers have been incidentally broken by recent changes, this commit takes the step to stop installing the headers at all. Downstreams can use clang's own OpenCL headers, and/or its -fdeclare-opencl-builtins flag. Fixes #119967. (cherry picked from commit a2b05761724e5243056988d9d6bf1a5a94715b74) --- libclc/CMakeLists.txt | 1 - libclc/libclc.pc.in | 2 -- 2 files changed, 3 deletions(-) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 43e213b385f5d..ad102e5100bde 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -226,7 +226,6 @@ set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii # pkg-config file configure_file( libclc.pc.in libclc.pc @ONLY ) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/libclc.pc DESTINATION "${CMAKE_INSTALL_DATADIR}/pkgconfig" ) -install( DIRECTORY generic/include/clc DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" ) if( ENABLE_RUNTIME_SUBNORMAL ) foreach( file IN ITEMS subnormal_use_default subnormal_disable ) diff --git a/libclc/libclc.pc.in b/libclc/libclc.pc.in index b6e06c9673501..3a9e58b3ef0cc 100644 --- a/libclc/libclc.pc.in +++ b/libclc/libclc.pc.in @@ -1,8 +1,6 @@ -includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@ libexecdir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_DATADIR@/clc Name: libclc Description: Library requirements of the OpenCL C programming language Version: @PROJECT_VERSION@ -Cflags: -I${includedir} Libs: -L${libexecdir} ___ 
llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libclc] release/20.x: [libclc] Stop installing CLC headers (#126908) (PR #130017)
arsenm wrote: Given the release is already out, I don't think it makes sense to pull this from the install in the release branch. https://github.com/llvm/llvm-project/pull/130017 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [LoongArch] Relax the restrictions of inlineasm operand modifier 'u' and 'w' (#129864) (PR #130009)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/130009 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [LoongArch] Relax the restrictions of inlineasm operand modifier 'u' and 'w' (#129864) (PR #130009)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/130009 Backport bae6644e1227b2555f92b1962dac6c2444eaaaf2 Requested by: @SixWeining >From 82c916a7f9fa7110cb38da56fdeb5aeb2edad8ab Mon Sep 17 00:00:00 2001 From: Lu Weining Date: Thu, 6 Mar 2025 16:17:12 +0800 Subject: [PATCH] [LoongArch] Relax the restrictions of inlineasm operand modifier 'u' and 'w' (#129864) - Allow 'u' and 'w' on LASX, LSX or floating point register operands. - Also add missing description in LangRef. Fixes #129863. (cherry picked from commit bae6644e1227b2555f92b1962dac6c2444eaaaf2) --- llvm/docs/LangRef.rst | 2 + .../Target/LoongArch/LoongArchAsmPrinter.cpp | 35 ++-- .../lasx/inline-asm-operand-modifier.ll | 40 +++ 3 files changed, 64 insertions(+), 13 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index e002195cb7ed5..1c8eaa60e1c8a 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -5826,6 +5826,8 @@ Hexagon: LoongArch: +- ``u``: Print an LASX register. +- ``w``: Print an LSX register. - ``z``: Print $zero register if operand is zero, otherwise print it normally. MSP430: diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index 169f9568e5362..895a8e2646692 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -90,20 +90,29 @@ bool LoongArchAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return false; } break; -case 'w': // Print LSX registers. - if (MO.getReg().id() >= LoongArch::VR0 && - MO.getReg().id() <= LoongArch::VR31) -break; - // The modifier is 'w' but the operand is not an LSX register; Report an - // unknown operand error. - return true; case 'u': // Print LASX registers. - if (MO.getReg().id() >= LoongArch::XR0 && - MO.getReg().id() <= LoongArch::XR31) -break; - // The modifier is 'u' but the operand is not an LASX register; Report an - // unknown operand error. 
- return true; +case 'w': // Print LSX registers. +{ + // If the operand is an LASX, LSX or floating point register, print the + // name of LASX or LSX register with the same index in that register + // class. + unsigned RegID = MO.getReg().id(), FirstReg; + if (RegID >= LoongArch::XR0 && RegID <= LoongArch::XR31) +FirstReg = LoongArch::XR0; + else if (RegID >= LoongArch::VR0 && RegID <= LoongArch::VR31) +FirstReg = LoongArch::VR0; + else if (RegID >= LoongArch::F0_64 && RegID <= LoongArch::F31_64) +FirstReg = LoongArch::F0_64; + else if (RegID >= LoongArch::F0 && RegID <= LoongArch::F31) +FirstReg = LoongArch::F0; + else +return true; + OS << '$' + << LoongArchInstPrinter::getRegisterName( +RegID - FirstReg + +(ExtraCode[0] == 'u' ? LoongArch::XR0 : LoongArch::VR0)); + return false; +} // TODO: handle other extra codes if any. } } diff --git a/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll b/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll index 201e34c8b5ae0..8b25a6525381b 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll @@ -12,3 +12,43 @@ entry: %0 = tail call <4 x i64> asm sideeffect "xvldi ${0:u}, 1", "=f"() ret void } + +define void @test_u_2xi64() nounwind { +; CHECK-LABEL: test_u_2xi64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT:#APP +; CHECK-NEXT:xvldi $xr0, 1 +; CHECK-NEXT:#NO_APP +; CHECK-NEXT:ret +entry: + %0 = tail call <2 x i64> asm sideeffect "xvldi ${0:u}, 1", "=f"() + ret void +} + +define void @test_w_4xi64() nounwind { +; CHECK-LABEL: test_w_4xi64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT:#APP +; CHECK-NEXT:vldi $vr0, 1 +; CHECK-NEXT:#NO_APP +; CHECK-NEXT:ret +entry: + %0 = tail call <4 x i64> asm sideeffect "vldi ${0:w}, 1", "=f"() + ret void +} + +define void @m128i_to_m256i(ptr %out, ptr %in) nounwind { +; CHECK-LABEL: m128i_to_m256i: +; CHECK: # %bb.0: +; CHECK-NEXT:vld $vr0, $a1, 0 +; CHECK-NEXT:xvrepli.b $xr1, 0 
+; CHECK-NEXT:#APP +; CHECK-NEXT:xvpermi.q $xr1, $xr0, 32 +; CHECK-NEXT:#NO_APP +; CHECK-NEXT:xvst $xr1, $a0, 0 +; CHECK-NEXT:ret + %v = load <2 x i64>, ptr %in + %x = call <4 x i64> asm sideeffect "xvpermi.q ${0:u}, ${1:u}, 32", "=f,f,0"(<2 x i64> %v, <4 x i64> zeroinitializer) + store <4 x i64> %x, ptr %out + ret void +} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [LoongArch] Relax the restrictions of inlineasm operand modifier 'u' and 'w' (#129864) (PR #130009)
llvmbot wrote: @llvm/pr-subscribers-backend-loongarch Author: None (llvmbot) Changes Backport bae6644e1227b2555f92b1962dac6c2444eaaaf2 Requested by: @SixWeining --- Full diff: https://github.com/llvm/llvm-project/pull/130009.diff 3 Files Affected: - (modified) llvm/docs/LangRef.rst (+2) - (modified) llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp (+22-13) - (modified) llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll (+40) ``diff diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index e002195cb7ed5..1c8eaa60e1c8a 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -5826,6 +5826,8 @@ Hexagon: LoongArch: +- ``u``: Print an LASX register. +- ``w``: Print an LSX register. - ``z``: Print $zero register if operand is zero, otherwise print it normally. MSP430: diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index 169f9568e5362..895a8e2646692 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -90,20 +90,29 @@ bool LoongArchAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return false; } break; -case 'w': // Print LSX registers. - if (MO.getReg().id() >= LoongArch::VR0 && - MO.getReg().id() <= LoongArch::VR31) -break; - // The modifier is 'w' but the operand is not an LSX register; Report an - // unknown operand error. - return true; case 'u': // Print LASX registers. - if (MO.getReg().id() >= LoongArch::XR0 && - MO.getReg().id() <= LoongArch::XR31) -break; - // The modifier is 'u' but the operand is not an LASX register; Report an - // unknown operand error. - return true; +case 'w': // Print LSX registers. +{ + // If the operand is an LASX, LSX or floating point register, print the + // name of LASX or LSX register with the same index in that register + // class. 
+ unsigned RegID = MO.getReg().id(), FirstReg; + if (RegID >= LoongArch::XR0 && RegID <= LoongArch::XR31) +FirstReg = LoongArch::XR0; + else if (RegID >= LoongArch::VR0 && RegID <= LoongArch::VR31) +FirstReg = LoongArch::VR0; + else if (RegID >= LoongArch::F0_64 && RegID <= LoongArch::F31_64) +FirstReg = LoongArch::F0_64; + else if (RegID >= LoongArch::F0 && RegID <= LoongArch::F31) +FirstReg = LoongArch::F0; + else +return true; + OS << '$' + << LoongArchInstPrinter::getRegisterName( +RegID - FirstReg + +(ExtraCode[0] == 'u' ? LoongArch::XR0 : LoongArch::VR0)); + return false; +} // TODO: handle other extra codes if any. } } diff --git a/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll b/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll index 201e34c8b5ae0..8b25a6525381b 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.ll @@ -12,3 +12,43 @@ entry: %0 = tail call <4 x i64> asm sideeffect "xvldi ${0:u}, 1", "=f"() ret void } + +define void @test_u_2xi64() nounwind { +; CHECK-LABEL: test_u_2xi64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT:#APP +; CHECK-NEXT:xvldi $xr0, 1 +; CHECK-NEXT:#NO_APP +; CHECK-NEXT:ret +entry: + %0 = tail call <2 x i64> asm sideeffect "xvldi ${0:u}, 1", "=f"() + ret void +} + +define void @test_w_4xi64() nounwind { +; CHECK-LABEL: test_w_4xi64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT:#APP +; CHECK-NEXT:vldi $vr0, 1 +; CHECK-NEXT:#NO_APP +; CHECK-NEXT:ret +entry: + %0 = tail call <4 x i64> asm sideeffect "vldi ${0:w}, 1", "=f"() + ret void +} + +define void @m128i_to_m256i(ptr %out, ptr %in) nounwind { +; CHECK-LABEL: m128i_to_m256i: +; CHECK: # %bb.0: +; CHECK-NEXT:vld $vr0, $a1, 0 +; CHECK-NEXT:xvrepli.b $xr1, 0 +; CHECK-NEXT:#APP +; CHECK-NEXT:xvpermi.q $xr1, $xr0, 32 +; CHECK-NEXT:#NO_APP +; CHECK-NEXT:xvst $xr1, $a0, 0 +; CHECK-NEXT:ret + %v = load <2 x i64>, ptr %in + %x = call <4 x i64> asm sideeffect "xvpermi.q 
${0:u}, ${1:u}, 32", "=f,f,0"(<2 x i64> %v, <4 x i64> zeroinitializer) + store <4 x i64> %x, ptr %out + ret void +} `` https://github.com/llvm/llvm-project/pull/130009 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Support image_bvh8_intersect_ray instruction and intrinsic. (PR #130041)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Mariusz Sikora (mariusz-sikora-at-amd) Changes --- Patch is 23.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130041.diff 12 Files Affected: - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+11) - (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+1) - (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+13-8) - (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h (+2-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+12-6) - (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (+18-14) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+10-6) - (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+8) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll (+87) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_vimage.s (+3) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s (+3) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt (+3) ``diff diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f93439b30523e..d0ce9f0b8322d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2812,6 +2812,17 @@ def int_amdgcn_image_bvh_dual_intersect_ray : llvm_v3f32_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadMem, IntrWillReturn]>; +// , , +// llvm.amdgcn.image.bvh8.intersect.ray , , +//, , +//, , +// +def int_amdgcn_image_bvh8_intersect_ray : + Intrinsic<[llvm_v10i32_ty, llvm_v3f32_ty, llvm_v3f32_ty], +[llvm_i64_ty, llvm_float_ty, llvm_i8_ty, llvm_v3f32_ty, + llvm_v3f32_ty, llvm_i32_ty, llvm_v4i32_ty], +[IntrReadMem, IntrWillReturn]>; + // llvm.amdgcn.permlane16.var def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, Intrinsic<[llvm_i32_ty], diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 
9c3bdd74a5cb0..8777a440c613b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4090,6 +4090,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { } case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: return selectBVHIntersectRayIntrinsic(I); case AMDGPU::G_SBFX: case AMDGPU::G_UBFX: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index cd0554a5c5b99..3e4c946ee9010 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7183,8 +7183,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic( return true; } -bool AMDGPULegalizerInfo::legalizeBVHDualIntrinsic(MachineInstr &MI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic( +MachineInstr &MI, MachineIRBuilder &B) const { const LLT S32 = LLT::scalar(32); const LLT V2S32 = LLT::fixed_vector(2, 32); @@ -7207,11 +7207,14 @@ bool AMDGPULegalizerInfo::legalizeBVHDualIntrinsic(MachineInstr &MI, return false; } + bool IsBVH8 = cast(MI).getIntrinsicID() == +Intrinsic::amdgcn_image_bvh8_intersect_ray; const unsigned NumVDataDwords = 10; - const unsigned NumVAddrDwords = 12; - int Opcode = AMDGPU::getMIMGOpcode(AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, - AMDGPU::MIMGEncGfx12, NumVDataDwords, - NumVAddrDwords); + const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12; + int Opcode = AMDGPU::getMIMGOpcode( + IsBVH8 ? 
AMDGPU::IMAGE_BVH8_INTERSECT_RAY + : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, + AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords); assert(Opcode != -1); SmallVector Ops; @@ -7223,7 +7226,8 @@ bool AMDGPULegalizerInfo::legalizeBVHDualIntrinsic(MachineInstr &MI, Ops.push_back(RayDir); Ops.push_back(Offsets); - auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY) + auto MIB = B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY + : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY) .addDef(DstReg) .addDef(DstOrigin) .addDef(DstDir) @@ -7587,7 +7591,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntersectRayIntrinsic(MI, B); case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: -return legalizeBVHDualInt
[llvm-branch-commits] [llvm] [AMDGPU] Support image_bvh8_intersect_ray instruction and intrinsic. (PR #130041)
llvmbot wrote: @llvm/pr-subscribers-llvm-ir Author: Mariusz Sikora (mariusz-sikora-at-amd) Changes --- Patch is 23.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130041.diff 12 Files Affected: - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+11) - (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+1) - (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+13-8) - (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h (+2-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+12-6) - (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (+18-14) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+10-6) - (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+8) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll (+87) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_vimage.s (+3) - (modified) llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s (+3) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt (+3) ``diff diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f93439b30523e..d0ce9f0b8322d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2812,6 +2812,17 @@ def int_amdgcn_image_bvh_dual_intersect_ray : llvm_v3f32_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadMem, IntrWillReturn]>; +// , , +// llvm.amdgcn.image.bvh8.intersect.ray , , +//, , +//, , +// +def int_amdgcn_image_bvh8_intersect_ray : + Intrinsic<[llvm_v10i32_ty, llvm_v3f32_ty, llvm_v3f32_ty], +[llvm_i64_ty, llvm_float_ty, llvm_i8_ty, llvm_v3f32_ty, + llvm_v3f32_ty, llvm_i32_ty, llvm_v4i32_ty], +[IntrReadMem, IntrWillReturn]>; + // llvm.amdgcn.permlane16.var def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, Intrinsic<[llvm_i32_ty], diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 
9c3bdd74a5cb0..8777a440c613b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4090,6 +4090,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { } case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: return selectBVHIntersectRayIntrinsic(I); case AMDGPU::G_SBFX: case AMDGPU::G_UBFX: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index cd0554a5c5b99..3e4c946ee9010 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7183,8 +7183,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic( return true; } -bool AMDGPULegalizerInfo::legalizeBVHDualIntrinsic(MachineInstr &MI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic( +MachineInstr &MI, MachineIRBuilder &B) const { const LLT S32 = LLT::scalar(32); const LLT V2S32 = LLT::fixed_vector(2, 32); @@ -7207,11 +7207,14 @@ bool AMDGPULegalizerInfo::legalizeBVHDualIntrinsic(MachineInstr &MI, return false; } + bool IsBVH8 = cast(MI).getIntrinsicID() == +Intrinsic::amdgcn_image_bvh8_intersect_ray; const unsigned NumVDataDwords = 10; - const unsigned NumVAddrDwords = 12; - int Opcode = AMDGPU::getMIMGOpcode(AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, - AMDGPU::MIMGEncGfx12, NumVDataDwords, - NumVAddrDwords); + const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12; + int Opcode = AMDGPU::getMIMGOpcode( + IsBVH8 ? 
AMDGPU::IMAGE_BVH8_INTERSECT_RAY + : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, + AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords); assert(Opcode != -1); SmallVector Ops; @@ -7223,7 +7226,8 @@ bool AMDGPULegalizerInfo::legalizeBVHDualIntrinsic(MachineInstr &MI, Ops.push_back(RayDir); Ops.push_back(Offsets); - auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY) + auto MIB = B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY + : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY) .addDef(DstReg) .addDef(DstOrigin) .addDef(DstDir) @@ -7587,7 +7591,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntersectRayIntrinsic(MI, B); case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: -return legalizeBVHDualIntrinsic(
[llvm-branch-commits] [llvm] release/20.x: [LoongArch] Relax the restrictions of inlineasm operand modifier 'u' and 'w' (#129864) (PR #130009)
https://github.com/heiher approved this pull request. LGTM. https://github.com/llvm/llvm-project/pull/130009 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add SubtargetFeature for dynamic VGPR mode (PR #130030)
https://github.com/rovka created https://github.com/llvm/llvm-project/pull/130030 This represents a hardware mode supported only for wave32 compute shaders. When enabled, we set the `.dynamic_vgpr_en` field of `.compute_registers` to true in the PAL metadata. >From b2a7bdc3954d2bf72e99d730ce00159c2550f563 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Mon, 23 Oct 2023 10:36:31 +0200 Subject: [PATCH] [AMDGPU] Add SubtargetFeature for dynamic VGPR mode This represents a hardware mode supported only for wave32 compute shaders. When enabled, we set the `.dynamic_vgpr_en` field of `.compute_registers` to true in the PAL metadata. --- llvm/docs/AMDGPUUsage.rst| 6 ++ llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 3 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h| 3 +++ llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll | 13 - 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index def6addd595e8..59cc08a59ed7c 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -758,6 +758,12 @@ For example: enabled will execute correctly but may be less performant than code generated for XNACK replay disabled. + + dynamic-vgprTODO Represents the "Dynamic VGPR" hardware mode, introduced in GFX12. + Waves launched in this mode may allocate or deallocate the VGPRs + using dedicated instructions, but may not send the DEALLOC_VGPRS + message. + === == .. _amdgpu-target-id: diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index effc8d2ed6b49..31a98ee132bf6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1239,6 +1239,12 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts", "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" >; +def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", + "DynamicVGPR", + "true", + "Enable dynamic VGPR mode" +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 31e0bd8d652bc..13e61756e3036 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1414,6 +1414,9 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, MD->setHwStage(CC, ".trap_present", (bool)CurrentProgramInfo.TrapHandlerEnable); MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable); + +if (ST.isDynamicVGPREnabled()) + MD->setComputeRegisters(".dynamic_vgpr_en", true); } MD->setHwStage(CC, ".lds_size", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6664a70572ded..1254cbad83b60 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -190,6 +190,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// indicates a lack of S_CLAUSE support. unsigned MaxHardClauseLength = 0; bool SupportsSRAMECC = false; + bool DynamicVGPR = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for SRAMECC. @@ -1647,6 +1648,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return true; } + bool isDynamicVGPREnabled() const { return DynamicVGPR; } + bool requiresDisjointEarlyClobberAndUndef() const override { // AMDGPU doesn't care if early-clobber and undef operands are allocated // to the same register. 
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll index 7536e83a9da6b..fa22089978c2e 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr <%s | FileCheck %s --check-prefixes=CHECK,DVGPR ; CHECK-LABEL: {{^}}_amdgpu_cs_main: ; CHECK: ; TotalNumSgprs: 4 @@ -8,6 +10,7 @@ ; CHECK-NEXT: amdpal.pipelines: ; CHECK-NEXT: - .api:Vulkan ; CHECK-NEXT: .compute_registers: +; DVGPR-NEXT: .dynamic_vg
[llvm-branch-commits] [llvm] [AMDGPU] Add SubtargetFeature for dynamic VGPR mode (PR #130030)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Diana Picus (rovka) Changes This represents a hardware mode supported only for wave32 compute shaders. When enabled, we set the `.dynamic_vgpr_en` field of `.compute_registers` to true in the PAL metadata. --- Full diff: https://github.com/llvm/llvm-project/pull/130030.diff 5 Files Affected: - (modified) llvm/docs/AMDGPUUsage.rst (+6) - (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+6) - (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+3) - (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3) - (modified) llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll (+8-5) ``diff diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index def6addd595e8..59cc08a59ed7c 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -758,6 +758,12 @@ For example: enabled will execute correctly but may be less performant than code generated for XNACK replay disabled. + + dynamic-vgprTODO Represents the "Dynamic VGPR" hardware mode, introduced in GFX12. + Waves launched in this mode may allocate or deallocate the VGPRs + using dedicated instructions, but may not send the DEALLOC_VGPRS + message. + === == .. _amdgpu-target-id: diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index effc8d2ed6b49..31a98ee132bf6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1239,6 +1239,12 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts", "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" >; +def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", + "DynamicVGPR", + "true", + "Enable dynamic VGPR mode" +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 31e0bd8d652bc..13e61756e3036 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1414,6 +1414,9 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, MD->setHwStage(CC, ".trap_present", (bool)CurrentProgramInfo.TrapHandlerEnable); MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable); + +if (ST.isDynamicVGPREnabled()) + MD->setComputeRegisters(".dynamic_vgpr_en", true); } MD->setHwStage(CC, ".lds_size", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6664a70572ded..1254cbad83b60 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -190,6 +190,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// indicates a lack of S_CLAUSE support. unsigned MaxHardClauseLength = 0; bool SupportsSRAMECC = false; + bool DynamicVGPR = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for SRAMECC. @@ -1647,6 +1648,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return true; } + bool isDynamicVGPREnabled() const { return DynamicVGPR; } + bool requiresDisjointEarlyClobberAndUndef() const override { // AMDGPU doesn't care if early-clobber and undef operands are allocated // to the same register. 
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll index 7536e83a9da6b..fa22089978c2e 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr <%s | FileCheck %s --check-prefixes=CHECK,DVGPR ; CHECK-LABEL: {{^}}_amdgpu_cs_main: ; CHECK: ; TotalNumSgprs: 4 @@ -8,6 +10,7 @@ ; CHECK-NEXT: amdpal.pipelines: ; CHECK-NEXT: - .api:Vulkan ; CHECK-NEXT: .compute_registers: +; DVGPR-NEXT: .dynamic_vgpr_en: true ; CHECK-NEXT: .tg_size_en: true ; CHECK-NEXT: .tgid_x_en: false ; CHECK-NEXT: .tgid_y_en: false @@ -57,7 +60,7 @@ ; CHECK-NEXT:.entry_point_symbol:_amdgpu_cs_main ; CHECK-NEXT:.excp_en:0 ; CHECK-NEXT:.float_mode: 0xc0 -; CHECK-NEXT
[llvm-branch-commits] [llvm] release/20.x: [AArch64] Fix SVE scalar fcopysign lowering without neon. (#129787) (PR #129997)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/129997 Backport 4c2d1b4c53de d4ab3df320f9 Requested by: @davemgreen >From e9619c1c70840718b0a59901d2da788b176597ee Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 4 Mar 2025 21:46:55 + Subject: [PATCH 1/2] [AArch64] Add test for scalar copysign. NFC (cherry picked from commit 4c2d1b4c53def85e16d3612b92379a347d76baf0) --- ...e-streaming-mode-fixed-length-fcopysign.ll | 228 ++ 1 file changed, 228 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index 2282e74af5d00..238c124b7cb06 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -8,6 +8,234 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" +define void @test_copysign_f16(ptr %ap, ptr %bp) { +; SVE-LABEL: test_copysign_f16: +; SVE: // %bb.0: +; SVE-NEXT:adrp x8, .LCPI0_0 +; SVE-NEXT:ldr h1, [x0] +; SVE-NEXT:ldr h2, [x1] +; SVE-NEXT:ldr q0, [x8, :lo12:.LCPI0_0] +; SVE-NEXT:adrp x8, .LCPI0_1 +; SVE-NEXT:ldr q4, [x8, :lo12:.LCPI0_1] +; SVE-NEXT:mov z3.d, z0.d +; SVE-NEXT:fmov s0, s1 +; SVE-NEXT:fmov s3, s2 +; SVE-NEXT:bif v0.16b, v3.16b, v4.16b +; SVE-NEXT:str h0, [x0] +; SVE-NEXT:ret +; +; SVE2-LABEL: test_copysign_f16: +; SVE2: // %bb.0: +; SVE2-NEXT:adrp x8, .LCPI0_0 +; SVE2-NEXT:ldr h1, [x0] +; SVE2-NEXT:ldr h2, [x1] +; SVE2-NEXT:ldr q0, [x8, :lo12:.LCPI0_0] +; SVE2-NEXT:adrp x8, .LCPI0_1 +; SVE2-NEXT:ldr q4, [x8, :lo12:.LCPI0_1] +; SVE2-NEXT:mov z3.d, z0.d +; SVE2-NEXT:fmov s0, s1 +; SVE2-NEXT:fmov s3, s2 +; SVE2-NEXT:bif v0.16b, v3.16b, v4.16b +; SVE2-NEXT:str h0, [x0] +; SVE2-NEXT:ret +; +; NONEON-NOSVE-LABEL: test_copysign_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT:sub sp, sp, #16 +; NONEON-NOSVE-NEXT:.cfi_def_cfa_offset 16 +; 
NONEON-NOSVE-NEXT:ldr h0, [x0] +; NONEON-NOSVE-NEXT:ldr h1, [x1] +; NONEON-NOSVE-NEXT:fcvt s0, h0 +; NONEON-NOSVE-NEXT:str h1, [sp, #12] +; NONEON-NOSVE-NEXT:ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT:tst w8, #0x80 +; NONEON-NOSVE-NEXT:fabs s0, s0 +; NONEON-NOSVE-NEXT:fneg s1, s0 +; NONEON-NOSVE-NEXT:fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT:fcvt h0, s0 +; NONEON-NOSVE-NEXT:str h0, [x0] +; NONEON-NOSVE-NEXT:add sp, sp, #16 +; NONEON-NOSVE-NEXT:ret + %a = load half, ptr %ap + %b = load half, ptr %bp + %r = call half @llvm.copysign.f16(half %a, half %b) + store half %r, ptr %ap + ret void +} + +define void @test_copysign_bf16(ptr %ap, ptr %bp) { +; SVE-LABEL: test_copysign_bf16: +; SVE: // %bb.0: +; SVE-NEXT:adrp x8, .LCPI1_0 +; SVE-NEXT:ldr h1, [x0] +; SVE-NEXT:ldr h2, [x1] +; SVE-NEXT:ldr q0, [x8, :lo12:.LCPI1_0] +; SVE-NEXT:adrp x8, .LCPI1_1 +; SVE-NEXT:ldr q4, [x8, :lo12:.LCPI1_1] +; SVE-NEXT:mov z3.d, z0.d +; SVE-NEXT:fmov s0, s1 +; SVE-NEXT:fmov s3, s2 +; SVE-NEXT:bif v0.16b, v3.16b, v4.16b +; SVE-NEXT:str h0, [x0] +; SVE-NEXT:ret +; +; SVE2-LABEL: test_copysign_bf16: +; SVE2: // %bb.0: +; SVE2-NEXT:adrp x8, .LCPI1_0 +; SVE2-NEXT:ldr h1, [x0] +; SVE2-NEXT:ldr h2, [x1] +; SVE2-NEXT:ldr q0, [x8, :lo12:.LCPI1_0] +; SVE2-NEXT:adrp x8, .LCPI1_1 +; SVE2-NEXT:ldr q4, [x8, :lo12:.LCPI1_1] +; SVE2-NEXT:mov z3.d, z0.d +; SVE2-NEXT:fmov s0, s1 +; SVE2-NEXT:fmov s3, s2 +; SVE2-NEXT:bif v0.16b, v3.16b, v4.16b +; SVE2-NEXT:str h0, [x0] +; SVE2-NEXT:ret +; +; NONEON-NOSVE-LABEL: test_copysign_bf16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT:sub sp, sp, #80 +; NONEON-NOSVE-NEXT:.cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT:ldr h0, [x0] +; NONEON-NOSVE-NEXT:ldr h1, [x1] +; NONEON-NOSVE-NEXT:str h0, [sp, #40] +; NONEON-NOSVE-NEXT:ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT:str h1, [sp, #76] +; NONEON-NOSVE-NEXT:ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT:str q0, [sp] +; NONEON-NOSVE-NEXT:ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT:lsl w9, w8, #16 +; NONEON-NOSVE-NEXT:ldr w8, [sp, #8] +; 
NONEON-NOSVE-NEXT:lsl w8, w8, #16 +; NONEON-NOSVE-NEXT:stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT:ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT:lsl w9, w8, #16 +; NONEON-NOSVE-NEXT:ldr w8, [sp] +; NONEON-NOSVE-NEXT:lsl w8, w8, #16 +; NONEON-NOSVE-NEXT:stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT:ldrb w8, [sp, #77] +; NONEON-NOSVE-NEXT:ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT:tst w8, #0x80 +; NONEON-NOSVE-NEXT:str q0, [sp, #48] +; NONEON-NOSVE-NEXT:ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT:fabs s0, s0 +; NONEON-NOSVE-NEXT:fneg s1, s0 +; NONEON-NOSVE-NEXT:fcsel s0, s1, s0, ne +; NONEON-
[llvm-branch-commits] [llvm] [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (PR #130047)
https://github.com/rovka created https://github.com/llvm/llvm-project/pull/130047 In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32 VGPRs (based on a chip-wide setting which we can model with a Subtarget feature). Update some of the subtarget helpers to reflect this. In particular: - getVGPRAllocGranule is set to the block size - getAddressableNumVGPRs will limit itself to 8 * size of a block We also try to be more careful about how many VGPR blocks we allocate. Therefore, when deciding if we should revert scheduling after a given stage, we check that we haven't increased the number of VGPR blocks that need to be allocated. >From ea460637afd43c854e70d184708ab0fcd2a20f73 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Mon, 5 Feb 2024 13:48:16 +0100 Subject: [PATCH] [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32 VGPRs (based on a chip-wide setting which we can model with a Subtarget feature). Update some of the subtarget helpers to reflect this. In particular: - getVGPRAllocGranule is set to the block size - getAddressableNumVGPRs will limit itself to 8 * size of a block We also try to be more careful about how many VGPR blocks we allocate. Therefore, when deciding if we should revert scheduling after a given stage, we check that we haven't increased the number of VGPR blocks that need to be allocated. 
--- llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 10 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 + .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 6 ++ .../Target/AMDGPU/AMDGPUUnitTests.cpp | 62 +++ llvm/unittests/Target/AMDGPU/CMakeLists.txt | 1 + 6 files changed, 86 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 31a98ee132bf6..339eeec72da46 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1245,6 +1245,12 @@ def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", "Enable dynamic VGPR mode" >; +def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32", + "DynamicVGPRBlockSize32", + "true", + "Use a block size of 32 for dynamic VGPR allocation (default is 16)" +>; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index c277223de13ac..4cc71f321f8f2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { if (WavesAfter < DAG.MinOccupancy) return true; + // For dynamic VGPR mode, we don't want to waste any VGPR blocks. 
+ if (ST.isDynamicVGPREnabled()) { +unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( +&ST, PressureBefore.getVGPRNum(false)); +unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( +&ST, PressureAfter.getVGPRNum(false)); +if (BlocksAfter > BlocksBefore) + return true; + } + return false; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 1254cbad83b60..9ccf38fb4dbbe 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -191,6 +191,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, unsigned MaxHardClauseLength = 0; bool SupportsSRAMECC = false; bool DynamicVGPR = false; + bool DynamicVGPRBlockSize32 = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for SRAMECC. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b51cf536467b9..bebbb0dde0b9b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1154,6 +1154,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, if (STI->getFeatureBits().test(FeatureGFX90AInsts)) return 8; + if (STI->getFeatureBits().test(FeatureDynamicVGPR)) +return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16; + bool IsWave32 = EnableWavefrontSize32 ? *EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); @@ -1195,6 +1198,9 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { if (STI->getFeatureBits().test(FeatureGFX90AInsts)) return 512; + if (STI->getFeatureBits().test(FeatureDynamicVGPR)) +// On GFX12 we can allocate at most 8 blocks of VGPRs. +return 8 * getVGPRAllocGranule(STI); return getAddressableNumArchVGPRs(STI); } diff --git a/llvm
[llvm-branch-commits] [llvm] [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (PR #130047)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Diana Picus (rovka) Changes In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32 VGPRs (based on a chip-wide setting which we can model with a Subtarget feature). Update some of the subtarget helpers to reflect this. In particular: - getVGPRAllocGranule is set to the block size - getAddressableNumVGPRs will limit itself to 8 * size of a block We also try to be more careful about how many VGPR blocks we allocate. Therefore, when deciding if we should revert scheduling after a given stage, we check that we haven't increased the number of VGPR blocks that need to be allocated. --- Full diff: https://github.com/llvm/llvm-project/pull/130047.diff 6 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+6) - (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+10) - (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+1) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+6) - (modified) llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp (+62) - (modified) llvm/unittests/Target/AMDGPU/CMakeLists.txt (+1) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 31a98ee132bf6..339eeec72da46 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1245,6 +1245,12 @@ def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", "Enable dynamic VGPR mode" >; +def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32", + "DynamicVGPRBlockSize32", + "true", + "Use a block size of 32 for dynamic VGPR allocation (default is 16)" +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index c277223de13ac..4cc71f321f8f2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { if (WavesAfter < DAG.MinOccupancy) return true; + // For dynamic VGPR mode, we don't want to waste any VGPR blocks. + if (ST.isDynamicVGPREnabled()) { +unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( +&ST, PressureBefore.getVGPRNum(false)); +unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( +&ST, PressureAfter.getVGPRNum(false)); +if (BlocksAfter > BlocksBefore) + return true; + } + return false; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 1254cbad83b60..9ccf38fb4dbbe 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -191,6 +191,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, unsigned MaxHardClauseLength = 0; bool SupportsSRAMECC = false; bool DynamicVGPR = false; + bool DynamicVGPRBlockSize32 = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for SRAMECC. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b51cf536467b9..bebbb0dde0b9b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1154,6 +1154,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, if (STI->getFeatureBits().test(FeatureGFX90AInsts)) return 8; + if (STI->getFeatureBits().test(FeatureDynamicVGPR)) +return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16; + bool IsWave32 = EnableWavefrontSize32 ? 
*EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); @@ -1195,6 +1198,9 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { if (STI->getFeatureBits().test(FeatureGFX90AInsts)) return 512; + if (STI->getFeatureBits().test(FeatureDynamicVGPR)) +// On GFX12 we can allocate at most 8 blocks of VGPRs. +return 8 * getVGPRAllocGranule(STI); return getAddressableNumArchVGPRs(STI); } diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp index 8fbd470815b79..21f45443281e7 100644 --- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp +++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp @@ -152,6 +152,24 @@ static void testGPRLimits(const char *RegName, bool TestW32W64, EXPECT_TRUE(ErrStr.empty()) << ErrStr; } +static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS, + TestFuncTy test) { + auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, + "+dynamic-vgpr," + FS.str()); + ASSERT_TRUE(TM) << "No target machine"; + +
[llvm-branch-commits] [llvm] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode (PR #130037)
@@ -0,0 +1,356 @@ +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR arsenm wrote: -O2 is the default, and you are using run-pass anyway https://github.com/llvm/llvm-project/pull/130037 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen][StaticDataSplitter]Support constant pool partitioning (PR #129781)
@@ -112,21 +117,52 @@ bool StaticDataSplitter::runOnMachineFunction(MachineFunction &MF) { return Changed; } +const Constant * +StaticDataSplitter::getConstant(const MachineOperand &Op, +const TargetMachine &TM, +const MachineConstantPool *MCP) { + if (!Op.isGlobal() && !Op.isCPI()) +return nullptr; + + if (Op.isGlobal()) { +// Find global variables with local linkage. +const GlobalVariable *GV = getLocalLinkageGlobalVariable(Op.getGlobal()); +// Skip 'special' global variables conservatively because they are +// often handled specially, and skip those not in static data +// sections. +if (!GV || GV->getName().starts_with("llvm.") || +!inStaticDataSection(GV, TM)) + return nullptr; +return GV; + } + assert(Op.isCPI() && "Op must be constant pool index in this branch"); + int CPI = Op.getIndex(); + if (CPI == -1) +return nullptr; + + assert(MCP != nullptr && "Constant pool info is not available."); + const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI]; + + if (CPE.isMachineConstantPoolEntry()) +return nullptr; + + return CPE.Val.ConstVal; +} + bool StaticDataSplitter::partitionStaticDataWithProfiles(MachineFunction &MF) { int NumChangedJumpTables = 0; - const TargetMachine &TM = MF.getTarget(); MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); // Jump table could be used by either terminating instructions or // non-terminating ones, so we walk all instructions and use // `MachineOperand::isJTI()` to identify jump table operands. - // Similarly, `MachineOperand::isCPI()` can identify constant pool usages - // in the same loop. + // Similarly, `MachineOperand::isCPI()` is used to identify constant pool + // usages in the same loop. 
for (const auto &MBB : MF) { for (const MachineInstr &I : MBB) { for (const MachineOperand &Op : I.operands()) { -if (!Op.isJTI() && !Op.isGlobal()) +if (!Op.isJTI() && !Op.isGlobal() && !Op.isCPI()) continue; std::optional Count = MBFI->getBlockProfileCount(&MBB); williamweixiao wrote: line 168 can be hoisted out to the outermost loop (i.e., line 163). https://github.com/llvm/llvm-project/pull/129781 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] 1d72977 - Revert "[mlir][ODS] Add a generated builder that takes the Properties struct …"
Author: Benjamin Chetioui Date: 2025-03-06T16:26:44+01:00 New Revision: 1d72977e0198033e8ae7f7317abec09d59b330de URL: https://github.com/llvm/llvm-project/commit/1d72977e0198033e8ae7f7317abec09d59b330de DIFF: https://github.com/llvm/llvm-project/commit/1d72977e0198033e8ae7f7317abec09d59b330de.diff LOG: Revert "[mlir][ODS] Add a generated builder that takes the Properties struct …" This reverts commit 35622a93bb034ad5c56e3a490060648b35ba49f1. Added: Modified: mlir/docs/DeclarativeRewrites.md mlir/docs/DefiningDialects/Operations.md mlir/include/mlir/IR/OpDefinition.h mlir/include/mlir/IR/OperationSupport.h mlir/test/lib/Dialect/Test/TestOps.td mlir/test/mlir-tblgen/op-attribute.td mlir/test/mlir-tblgen/op-decl-and-defs.td mlir/test/mlir-tblgen/op-result.td mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp mlir/unittests/TableGen/OpBuildGen.cpp Removed: diff --git a/mlir/docs/DeclarativeRewrites.md b/mlir/docs/DeclarativeRewrites.md index fd566a2393b63..888ce57fa3b53 100644 --- a/mlir/docs/DeclarativeRewrites.md +++ b/mlir/docs/DeclarativeRewrites.md @@ -237,9 +237,9 @@ In the above, we are using `BOp`'s result for building `COp`. Given that `COp` was specified with table-driven op definition, there will be several `build()` methods generated for it. One of them has aggregated -parameters for result types, operands, and properties in the signature: `void +parameters for result types, operands, and attributes in the signature: `void COp::build(..., ArrayRef resultTypes, Array operands, -const COp::Properties& properties)`. The pattern in the above calls this `build()` +ArrayRef attr)`. The pattern in the above calls this `build()` method for constructing the `COp`. 
In general, arguments in the result pattern will be passed directly to the diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md index 528070cd3ebff..8ff60ac21424c 100644 --- a/mlir/docs/DefiningDialects/Operations.md +++ b/mlir/docs/DefiningDialects/Operations.md @@ -465,18 +465,7 @@ def MyOp : ... { The following builders are generated: ```c++ -// All result-types/operands/properties/discardable attributes have one -// aggregate parameter. `Properties` is the properties structure of -// `MyOp`. -static void build(OpBuilder &odsBuilder, OperationState &odsState, - TypeRange resultTypes, - ValueRange operands, - Properties properties, - ArrayRef discardableAttributes = {}); - // All result-types/operands/attributes have one aggregate parameter. -// Inherent properties and discardable attributes are mixed together in the -// `attributes` dictionary. static void build(OpBuilder &odsBuilder, OperationState &odsState, TypeRange resultTypes, ValueRange operands, @@ -509,28 +498,20 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState, // All operands/attributes have aggregate parameters. // Generated if return type can be inferred. -static void build(OpBuilder &odsBuilder, OperationState &odsState, - ValueRange operands, - Properties properties, - ArrayRef discardableAttributes); - -// All operands/attributes have aggregate parameters. -// Generated if return type can be inferred. Uses the legacy merged attribute -// dictionary. static void build(OpBuilder &odsBuilder, OperationState &odsState, ValueRange operands, ArrayRef attributes); // (And manually specified builders depending on the specific op.) ``` -The first two forms provide basic uniformity so that we can create ops using -the same form regardless of the exact op. This is particularly useful for +The first form provides basic uniformity so that we can create ops using the +same form regardless of the exact op. 
This is particularly useful for implementing declarative pattern rewrites. -The third and fourth forms are good for use in manually written code, given that +The second and third forms are good for use in manually written code, given that they provide better guarantee via signatures. -The fourth form will be generated if any of the op's attribute has different +The third form will be generated if any of the op's attribute has different `Attr.returnType` from `Attr.storageType` and we know how to build an attribute from an unwrapped value (i.e., `Attr.constBuilderCall` is defined.) Additionally, for the third form, if an attribute appearing later in the diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h index 4fad61580b31a..d91c573c03efe 100644 --- a/mlir/include/mlir/IR/OpDefinition.h +++ b/mlir/include/mlir/IR/OpDefinition.h @@ -74,10 +74,7 @@ void ensureRegionTerminator( /// Structure used by default as a "marker" when no "Prop
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Cleanup AMDGPUPassRegistry.def (PR #130071)
https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130071 None >From 462a05548fb65f3f8cf1310edd064ee1483c1c43 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Thu, 6 Mar 2025 10:56:28 + Subject: [PATCH] [AMDGPU][NPM] Cleanup AMDGPUPassRegistry.def --- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 8 +--- llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp| 1 + 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index f14499d0d3146..ad2f3fc29077c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -102,6 +102,7 @@ MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) +MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass()) @@ -131,13 +132,6 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass()) #undef MACHINE_FUNCTION_PASS #define DUMMY_MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) -DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) -DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) - -// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it -// already exists. 
-DUMMY_MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) - // Global ISel passes DUMMY_MACHINE_FUNCTION_PASS("amdgpu-prelegalizer-combiner", AMDGPUPreLegalizerCombinerPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-postlegalizer-combiner", AMDGPUPostLegalizerCombinerPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp index b3a2139dfd24e..40094518dce0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp @@ -207,5 +207,5 @@ AMDGPUPreloadKernArgPrologPass::run(MachineFunction &MF, if (!AMDGPUPreloadKernArgProlog(MF).run()) return PreservedAnalyses::all(); - return PreservedAnalyses::none(); + return getMachineFunctionPassPreservedAnalyses(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f380ddd03957f..a71766f2fd012 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -24,6 +24,7 @@ #include "AMDGPUMacroFusion.h" #include "AMDGPUOpenCLEnqueuedBlockLowering.h" #include "AMDGPUPerfHintAnalysis.h" +#include "AMDGPUPreloadKernArgProlog.h" #include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUSplitModule.h" #include "AMDGPUTargetObjectFile.h" ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen][NPM] Port MachineSanitizerBinaryMetadata to NPM (PR #130069)
https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130069 None >From 003ff875ebf977ef373e8d039a31a5cbb3f8c853 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Thu, 6 Mar 2025 10:20:36 + Subject: [PATCH] [CodeGen][NPM] Port MachineSanitizerBinaryMetadata to NPM --- .../llvm/CodeGen/SanitizerBinaryMetadata.h| 26 + llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 3 +- .../llvm/Passes/MachinePassRegistry.def | 2 +- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/SanitizerBinaryMetadata.cpp | 37 ++- llvm/lib/Passes/PassBuilder.cpp | 1 + 7 files changed, 60 insertions(+), 13 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/SanitizerBinaryMetadata.h diff --git a/llvm/include/llvm/CodeGen/SanitizerBinaryMetadata.h b/llvm/include/llvm/CodeGen/SanitizerBinaryMetadata.h new file mode 100644 index 0..6cf2e11aa911e --- /dev/null +++ b/llvm/include/llvm/CodeGen/SanitizerBinaryMetadata.h @@ -0,0 +1,26 @@ +//===- llvm/CodeGen/SanitizerBinaryMetadata.h ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLVM_CODEGEN_SANITIZERBINARYMETADATA_H +#define LLVM_CODEGEN_SANITIZERBINARYMETADATA_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class MachineSanitizerBinaryMetadataPass +: public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, +MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_SANITIZERBINARYMETADATA_H diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 3fd3cbb28bc3e..c7bc4320cf8f0 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -206,7 +206,7 @@ void initializeMachineOutlinerPass(PassRegistry &); void initializeMachinePipelinerPass(PassRegistry &); void initializeMachinePostDominatorTreeWrapperPassPass(PassRegistry &); void initializeMachineRegionInfoPassPass(PassRegistry &); -void initializeMachineSanitizerBinaryMetadataPass(PassRegistry &); +void initializeMachineSanitizerBinaryMetadataLegacyPass(PassRegistry &); void initializeMachineSchedulerLegacyPass(PassRegistry &); void initializeMachineSinkingLegacyPass(PassRegistry &); void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index a86dc8d632a4e..74cdc7d66810b 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -75,6 +75,7 @@ #include "llvm/CodeGen/RenameIndependentSubregs.h" #include "llvm/CodeGen/ReplaceWithVeclib.h" #include "llvm/CodeGen/SafeStack.h" +#include "llvm/CodeGen/SanitizerBinaryMetadata.h" #include "llvm/CodeGen/SelectOptimize.h" #include "llvm/CodeGen/ShadowStackGCLowering.h" #include "llvm/CodeGen/SjLjEHPrepare.h" @@ -1002,7 +1003,7 @@ Error CodeGenPassBuilder::addMachinePasses( 
addPass(RemoveLoadsIntoFakeUsesPass()); addPass(StackMapLivenessPass()); addPass(LiveDebugValuesPass()); - addPass(MachineSanitizerBinaryMetadata()); + addPass(MachineSanitizerBinaryMetadataPass()); if (TM.Options.EnableMachineOutliner && getOptLevel() != CodeGenOptLevel::None && diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index cab8108ed30f6..8fa21751392f3 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -149,6 +149,7 @@ MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass()) MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass()) MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass()) MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass()) +MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadataPass()) MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass(TM)) MACHINE_FUNCTION_PASS("machinelicm", MachineLICMPass()) MACHINE_FUNCTION_PASS("no-op-machine-function", NoOpMachineFunctionPass()) @@ -279,7 +280,6 @@ DUMMY_MACHINE_FUNCTION_PASS("lrshrink", LiveRangeShrinkPass) DUMMY_MACHINE_FUNCTION_PASS("machine-combiner", MachineCombinerPass) DUMMY_MACHINE_FUNCTION_PASS("static-data-splitter", StaticDataSplitter) DUMMY_MACHINE_FUNCTION_PASS("machine-function-splitter", MachineFunctionSplitterPass) -DUMMY_MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadata) DUMMY_MACHINE_FUNCTION_PASS("machine-u
[llvm-branch-commits] [llvm] [CodeGen][NPM] Port RemoveLoadsIntoFakeUses to NPM (PR #130068)
https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130068 None >From e9865932db7cff474a1df1d71f9a01b2d6bec47f Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Thu, 6 Mar 2025 09:30:37 + Subject: [PATCH] [CodeGen][NPM] Port RemoveLoadsIntoFakeUses to NPM --- .../llvm/CodeGen/RemoveLoadsIntoFakeUses.h| 30 + llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 2 + .../llvm/Passes/MachinePassRegistry.def | 2 +- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp | 44 +++ llvm/lib/Passes/PassBuilder.cpp | 1 + .../CodeGen/X86/fake-use-remove-loads.mir | 2 + 8 files changed, 73 insertions(+), 12 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/RemoveLoadsIntoFakeUses.h diff --git a/llvm/include/llvm/CodeGen/RemoveLoadsIntoFakeUses.h b/llvm/include/llvm/CodeGen/RemoveLoadsIntoFakeUses.h new file mode 100644 index 0..bbd5b8b430bf6 --- /dev/null +++ b/llvm/include/llvm/CodeGen/RemoveLoadsIntoFakeUses.h @@ -0,0 +1,30 @@ +//===- llvm/CodeGen/RemoveLoadsIntoFakeUses.h ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLVM_CODEGEN_REMOVELOADSINTOFAKEUSES_H +#define LLVM_CODEGEN_REMOVELOADSINTOFAKEUSES_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class RemoveLoadsIntoFakeUsesPass +: public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, +MachineFunctionAnalysisManager &MFAM); + + MachineFunctionProperties getRequiredProperties() const { +return MachineFunctionProperties().set( +MachineFunctionProperties::Property::NoVRegs); + } +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_REMOVELOADSINTOFAKEUSES_H diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index e5bffde815117..3fd3cbb28bc3e 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -265,7 +265,7 @@ void initializeRegionOnlyViewerPass(PassRegistry &); void initializeRegionPrinterPass(PassRegistry &); void initializeRegionViewerPass(PassRegistry &); void initializeRegisterCoalescerLegacyPass(PassRegistry &); -void initializeRemoveLoadsIntoFakeUsesPass(PassRegistry &); +void initializeRemoveLoadsIntoFakeUsesLegacyPass(PassRegistry &); void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &); void initializeRenameIndependentSubregsLegacyPass(PassRegistry &); void initializeReplaceWithVeclibLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index aab2c58ac0f78..a86dc8d632a4e 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -70,6 +70,7 @@ #include "llvm/CodeGen/RegUsageInfoPropagate.h" #include "llvm/CodeGen/RegisterCoalescerPass.h" #include "llvm/CodeGen/RegisterUsageInfo.h" +#include "llvm/CodeGen/RemoveLoadsIntoFakeUses.h" #include "llvm/CodeGen/RemoveRedundantDebugValues.h" #include "llvm/CodeGen/RenameIndependentSubregs.h" #include 
"llvm/CodeGen/ReplaceWithVeclib.h" @@ -998,6 +999,7 @@ Error CodeGenPassBuilder::addMachinePasses( addPass(FuncletLayoutPass()); + addPass(RemoveLoadsIntoFakeUsesPass()); addPass(StackMapLivenessPass()); addPass(LiveDebugValuesPass()); addPass(MachineSanitizerBinaryMetadata()); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 9300f6935aa90..cab8108ed30f6 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -181,6 +181,7 @@ MACHINE_FUNCTION_PASS("reg-usage-collector", RegUsageInfoCollectorPass()) MACHINE_FUNCTION_PASS("reg-usage-propagation", RegUsageInfoPropagationPass()) MACHINE_FUNCTION_PASS("register-coalescer", RegisterCoalescerPass()) MACHINE_FUNCTION_PASS("rename-independent-subregs", RenameIndependentSubregsPass()) +MACHINE_FUNCTION_PASS("remove-loads-into-fake-uses", RemoveLoadsIntoFakeUsesPass()) MACHINE_FUNCTION_PASS("remove-redundant-debug-values", RemoveRedundantDebugValuesPass()) MACHINE_FUNCTION_PASS("require-all-machine-function-properties", RequireAllMachineFunctionPropertiesPass()) @@ -292,7 +293,6 @@ DUMMY_MACHINE_FUNCTION_PASS("ra-pbqp", RAPBQPPass) DUMMY_MACHINE_FUNCTION_PASS("regalloc", RegAllocPass) DUMMY_MACHINE_FUNCTION_PASS("regallocscoringpass", RegAllocScoringPass) DUMMY_MACHINE_FUNCTION_PASS("regbankselect", RegBankSelectPass) -DUMMY_MACHINE_FUNCTION_PASS("remove-loads-i
[llvm-branch-commits] [llvm] [CodeGen][NPM] Port StackFrameLayoutAnalysisPass to NPM (PR #130070)
https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130070 None >From 578b467fb9a8338e84833f6596768b8048bbd531 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Thu, 6 Mar 2025 10:45:25 + Subject: [PATCH] [CodeGen][NPM] Port StackFrameLayoutAnalysisPass to NPM --- .../CodeGen/StackFrameLayoutAnalysisPass.h| 26 llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 3 + .../llvm/Passes/MachinePassRegistry.def | 2 +- llvm/lib/CodeGen/CodeGen.cpp | 2 +- .../CodeGen/StackFrameLayoutAnalysisPass.cpp | 61 +-- llvm/lib/Passes/PassBuilder.cpp | 1 + .../CodeGen/X86/stack-frame-layout-remarks.ll | 1 + 8 files changed, 75 insertions(+), 23 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/StackFrameLayoutAnalysisPass.h diff --git a/llvm/include/llvm/CodeGen/StackFrameLayoutAnalysisPass.h b/llvm/include/llvm/CodeGen/StackFrameLayoutAnalysisPass.h new file mode 100644 index 0..5283cda30da12 --- /dev/null +++ b/llvm/include/llvm/CodeGen/StackFrameLayoutAnalysisPass.h @@ -0,0 +1,26 @@ +//===- llvm/CodeGen/StackFrameLayoutAnalysisPass.h --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLVM_CODEGEN_STACKFRAMELAYOUTANALYSISPASS_H +#define LLVM_CODEGEN_STACKFRAMELAYOUTANALYSISPASS_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class StackFrameLayoutAnalysisPass +: public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, +MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_STACKFRAMELAYOUTANALYSISPASS_H diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index c7bc4320cf8f0..9068aee8f8193 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -290,7 +290,7 @@ void initializeSlotIndexesWrapperPassPass(PassRegistry &); void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &); void initializeSpillPlacementWrapperLegacyPass(PassRegistry &); void initializeStackColoringLegacyPass(PassRegistry &); -void initializeStackFrameLayoutAnalysisPassPass(PassRegistry &); +void initializeStackFrameLayoutAnalysisLegacyPass(PassRegistry &); void initializeStaticDataSplitterPass(PassRegistry &); void initializeStackMapLivenessPass(PassRegistry &); void initializeStackProtectorPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 74cdc7d66810b..8cba36b36fbb2 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -80,6 +80,7 @@ #include "llvm/CodeGen/ShadowStackGCLowering.h" #include "llvm/CodeGen/SjLjEHPrepare.h" #include "llvm/CodeGen/StackColoring.h" +#include "llvm/CodeGen/StackFrameLayoutAnalysisPass.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/CodeGen/StackSlotColoring.h" #include "llvm/CodeGen/TailDuplication.h" @@ -1015,6 +1016,8 @@ Error CodeGenPassBuilder::addMachinePasses( 
addPass(MachineOutlinerPass(RunOnAllFunctions)); } + addPass(StackFrameLayoutAnalysisPass()); + // Add passes that directly emit MI after all other MI passes. derived().addPreEmitPass2(addPass); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 8fa21751392f3..01dd423de6955 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -187,6 +187,7 @@ MACHINE_FUNCTION_PASS("remove-redundant-debug-values", RemoveRedundantDebugValue MACHINE_FUNCTION_PASS("require-all-machine-function-properties", RequireAllMachineFunctionPropertiesPass()) MACHINE_FUNCTION_PASS("stack-coloring", StackColoringPass()) +MACHINE_FUNCTION_PASS("stack-frame-layout", StackFrameLayoutAnalysisPass()) MACHINE_FUNCTION_PASS("stack-slot-coloring", StackSlotColoringPass()) MACHINE_FUNCTION_PASS("tailduplication", TailDuplicatePass()) MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) @@ -295,7 +296,6 @@ DUMMY_MACHINE_FUNCTION_PASS("regallocscoringpass", RegAllocScoringPass) DUMMY_MACHINE_FUNCTION_PASS("regbankselect", RegBankSelectPass) DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass) DUMMY_MACHINE_FUNCTION_PASS("shrink-wrap", ShrinkWrapPass) -DUMMY_MACHINE_FUNCTION_PASS("stack-frame-layout", StackFrameLayoutAnalysisPass) DUMMY_MACHINE_FUNCTION_PASS("stackmap-liveness", StackMapLivenessPass) DU
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Port SILateBranchLowering to NPM (PR #130063)
https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130063 None >From 5f050b8e6b439c534cbbfe36305560b7fa2c5cfa Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Thu, 6 Mar 2025 05:26:49 + Subject: [PATCH] [AMDGPU][NPM] Port SILateBranchLowering to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 10 - llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 ++- .../Target/AMDGPU/SILateBranchLowering.cpp| 40 ++- llvm/test/CodeGen/AMDGPU/early-term.mir | 2 + llvm/test/CodeGen/AMDGPU/readlane_exec0.mir | 1 + 6 files changed, 46 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 635cdf7ef12df..8224129eb882e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -213,7 +213,7 @@ extern char &SILowerControlFlowLegacyID; void initializeSIPreEmitPeepholePass(PassRegistry &); extern char &SIPreEmitPeepholeID; -void initializeSILateBranchLoweringPass(PassRegistry &); +void initializeSILateBranchLoweringLegacyPass(PassRegistry &); extern char &SILateBranchLoweringPassID; void initializeSIOptimizeExecMaskingLegacyPass(PassRegistry &); @@ -382,6 +382,14 @@ class SIInsertHardClausesPass : public PassInfoMixin { MachineFunctionAnalysisManager &MFAM); }; +class SILateBranchLoweringPass +: public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, +MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 3eabe087a8a33..318aad5590cda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -111,6 +111,7 @@ MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass()) 
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) +MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) @@ -132,7 +133,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizations DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it // already exists. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6c24fe5f1441a..b9d62cc9e4b63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -540,7 +540,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIWholeQuadModeLegacyPass(*PR); initializeSILowerControlFlowLegacyPass(*PR); initializeSIPreEmitPeepholePass(*PR); - initializeSILateBranchLoweringPass(*PR); + initializeSILateBranchLoweringLegacyPass(*PR); initializeSIMemoryLegalizerLegacyPass(*PR); initializeSIOptimizeExecMaskingLegacyPass(*PR); initializeSIPreAllocateWWMRegsLegacyPass(*PR); @@ -2161,7 +2161,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { // TODO: addPass(SIInsertHardClausesPass()); } - // addPass(SILateBranchLoweringPass()); + addPass(SILateBranchLoweringPass()); + if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) { // TODO: addPass(AMDGPUSetWavePriorityPass()); } diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index d02173f57ee37..0f5b6bd9374b0 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -16,6 +16,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachinePassManager.h" using namespace llvm; @@ -23,7 +24,7 @@ using namespace llvm; namespace { -class SILateBranchLowering : public MachineFunctionPass { +class SILateBranchLowering { private: const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; @@ -33,14 +34,23 @@ cl
[llvm-branch-commits] [llvm] [CodeGen][NPM] Port PostRAHazardRecognizer to NPM (PR #130066)
https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130066 None >From 0a495b860dfd609179c655c87c2146c6aba3c7f1 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Thu, 6 Mar 2025 06:42:54 + Subject: [PATCH] [CodeGen][NPM] Port PostRAHazardRecognizer to NPM --- .../llvm/CodeGen/PostRAHazardRecognizer.h | 26 +++ llvm/include/llvm/InitializePasses.h | 2 +- .../llvm/Passes/MachinePassRegistry.def | 1 + llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/PostRAHazardRecognizer.cpp | 46 +-- llvm/lib/Passes/PassBuilder.cpp | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +- .../AMDGPU/break-smem-soft-clauses.mir| 2 + llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 2 + .../hazard-flat-instruction-valu-check.mir| 1 + 10 files changed, 68 insertions(+), 18 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/PostRAHazardRecognizer.h diff --git a/llvm/include/llvm/CodeGen/PostRAHazardRecognizer.h b/llvm/include/llvm/CodeGen/PostRAHazardRecognizer.h new file mode 100644 index 0..3e0c04ac5e403 --- /dev/null +++ b/llvm/include/llvm/CodeGen/PostRAHazardRecognizer.h @@ -0,0 +1,26 @@ +//===- llvm/CodeGen/PostRAHazardRecognizer.h *- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLVM_CODEGEN_POSTRAHAZARDRECOGNIZER_H +#define LLVM_CODEGEN_POSTRAHAZARDRECOGNIZER_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class PostRAHazardRecognizerPass +: public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, +MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_POSTRAHAZARDRECOGNIZER_H diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index f1c16e3b1cb40..a3fd97ee99f3b 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -237,7 +237,7 @@ void initializePostDomViewerWrapperPassPass(PassRegistry &); void initializePostDominatorTreeWrapperPassPass(PassRegistry &); void initializePostInlineEntryExitInstrumenterPass(PassRegistry &); void initializePostMachineSchedulerLegacyPass(PassRegistry &); -void initializePostRAHazardRecognizerPass(PassRegistry &); +void initializePostRAHazardRecognizerLegacyPass(PassRegistry &); void initializePostRAMachineSinkingPass(PassRegistry &); void initializePostRASchedulerLegacyPass(PassRegistry &); void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index bedbc3e88a7ce..285ad9601c6ff 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -155,6 +155,7 @@ MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass()) MACHINE_FUNCTION_PASS("patchable-function", PatchableFunctionPass()) MACHINE_FUNCTION_PASS("peephole-opt", PeepholeOptimizerPass()) MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass()) +MACHINE_FUNCTION_PASS("post-RA-hazard-rec", PostRAHazardRecognizerPass()) MACHINE_FUNCTION_PASS("post-RA-sched", 
PostRASchedulerPass(TM)) MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass(TM)) MACHINE_FUNCTION_PASS("post-ra-pseudos", ExpandPostRAPseudosPass()) diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 375176ed4b1ce..69b4d8bac94cf 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -106,7 +106,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializePatchableFunctionLegacyPass(Registry); initializePeepholeOptimizerLegacyPass(Registry); initializePostMachineSchedulerLegacyPass(Registry); - initializePostRAHazardRecognizerPass(Registry); + initializePostRAHazardRecognizerLegacyPass(Registry); initializePostRAMachineSinkingPass(Registry); initializePostRASchedulerLegacyPass(Registry); initializePreISelIntrinsicLoweringLegacyPassPass(Registry); diff --git a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp index 97b1532300b17..3ead2087fc1d9 100644 --- a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp +++ b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp @@ -26,6 +26,7 @@ // //===--===// +#include "llvm/CodeGen/PostRAHazardRecognizer.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" @@ -40,30 +41,45 @@ using namespace llvm
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Port GCNCreateVOPD to NPM (PR #130059)
optimisan wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130059?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#130070** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130070?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130069** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130069?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130068** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130068?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130067** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130067?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130066** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130066?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130065** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130065?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130064** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130064?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#130063** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130063?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130062** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130062?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130061** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130061?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130060** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130060?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130059** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130059?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130059?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#129866** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129866?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129865** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129865?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129857** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129857?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#129853** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129853?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129828** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129828?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/130059 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Cleanup AMDGPUPassRegistry.def (PR #130071)
optimisan wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130071?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#130071** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130071?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130071?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#130070** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130070?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130069** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130069?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130068** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130068?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130067** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130067?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130066** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130066?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#130065** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130065?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130064** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130064?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130063** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130063?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130062** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130062?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130061** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130061?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130060** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130060?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130059** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130059?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129866** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129866?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129865** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129865?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#129857** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129857?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129853** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129853?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129828** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129828?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/130071 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen][NPM] Port MachineSanitizerBinaryMetadata to NPM (PR #130069)
optimisan wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130069?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#130071** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130071?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130070** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130070?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130069** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130069?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130069?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#130068** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130068?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130067** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130067?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130066** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130066?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#130065** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130065?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130064** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130064?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130063** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130063?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130062** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130062?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130061** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130061?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130060** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130060?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130059** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130059?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129866** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129866?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129865** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129865?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#129857** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129857?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129853** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129853?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129828** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129828?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/130069 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen][NPM] Port StackFrameLayoutAnalysisPass to NPM (PR #130070)
optimisan wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/130070?utm_source=stack-comment-downstack-mergeability-warning). > Learn more: https://graphite.dev/docs/merge-pull-requests * **#130071** https://app.graphite.dev/github/pr/llvm/llvm-project/130071?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130070** https://app.graphite.dev/github/pr/llvm/llvm-project/130070?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/130070?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#130069** https://app.graphite.dev/github/pr/llvm/llvm-project/130069?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130068** https://app.graphite.dev/github/pr/llvm/llvm-project/130068?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130067** https://app.graphite.dev/github/pr/llvm/llvm-project/130067?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130066** https://app.graphite.dev/github/pr/llvm/llvm-project/130066?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130065** https://app.graphite.dev/github/pr/llvm/llvm-project/130065?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130064** https://app.graphite.dev/github/pr/llvm/llvm-project/130064?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130063** https://app.graphite.dev/github/pr/llvm/llvm-project/130063?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130062** https://app.graphite.dev/github/pr/llvm/llvm-project/130062?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130061** https://app.graphite.dev/github/pr/llvm/llvm-project/130061?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130060** https://app.graphite.dev/github/pr/llvm/llvm-project/130060?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130059** https://app.graphite.dev/github/pr/llvm/llvm-project/130059?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129866** https://app.graphite.dev/github/pr/llvm/llvm-project/129866?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129865** https://app.graphite.dev/github/pr/llvm/llvm-project/129865?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129857** https://app.graphite.dev/github/pr/llvm/llvm-project/129857?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129853** https://app.graphite.dev/github/pr/llvm/llvm-project/129853?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129828** https://app.graphite.dev/github/pr/llvm/llvm-project/129828?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/130070 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen][NPM] Port PostRAHazardRecognizer to NPM (PR #130066)
optimisan wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/130066?utm_source=stack-comment-downstack-mergeability-warning). > Learn more: https://graphite.dev/docs/merge-pull-requests * **#130071** https://app.graphite.dev/github/pr/llvm/llvm-project/130071?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130070** https://app.graphite.dev/github/pr/llvm/llvm-project/130070?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130069** https://app.graphite.dev/github/pr/llvm/llvm-project/130069?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130068** https://app.graphite.dev/github/pr/llvm/llvm-project/130068?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130067** https://app.graphite.dev/github/pr/llvm/llvm-project/130067?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130066** https://app.graphite.dev/github/pr/llvm/llvm-project/130066?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/130066?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#130065** https://app.graphite.dev/github/pr/llvm/llvm-project/130065?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130064** https://app.graphite.dev/github/pr/llvm/llvm-project/130064?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130063** https://app.graphite.dev/github/pr/llvm/llvm-project/130063?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130062** https://app.graphite.dev/github/pr/llvm/llvm-project/130062?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130061** https://app.graphite.dev/github/pr/llvm/llvm-project/130061?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130060** https://app.graphite.dev/github/pr/llvm/llvm-project/130060?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130059** https://app.graphite.dev/github/pr/llvm/llvm-project/130059?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129866** https://app.graphite.dev/github/pr/llvm/llvm-project/129866?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129865** https://app.graphite.dev/github/pr/llvm/llvm-project/129865?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129857** https://app.graphite.dev/github/pr/llvm/llvm-project/129857?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129853** https://app.graphite.dev/github/pr/llvm/llvm-project/129853?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129828** https://app.graphite.dev/github/pr/llvm/llvm-project/129828?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/130066 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Port GCNCreateVOPD to NPM (PR #130059)
https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130059 None >From d82b6dd57dcae61ba7790c7681dc5ae3a5d7fbbd Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Wed, 5 Mar 2025 10:52:00 + Subject: [PATCH] [AMDGPU][NPM] Port GCNCreateVOPD to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 7 ++- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 53 --- 4 files changed, 43 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 57297288eecb4..f208a8bb9964b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -358,6 +358,11 @@ class SIModeRegisterPass : public PassInfoMixin { PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM); }; +class GCNCreateVOPDPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &AM); +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -443,7 +448,7 @@ extern char &SIFormMemoryClausesID; void initializeSIPostRABundlerLegacyPass(PassRegistry &); extern char &SIPostRABundlerLegacyID; -void initializeGCNCreateVOPDPass(PassRegistry &); +void initializeGCNCreateVOPDLegacyPass(PassRegistry &); extern char &GCNCreateVOPDID; void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 1050855176c04..0e3dcb4267ede 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -103,6 +103,7 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) 
MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) +MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass()) MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index ce3dcd920bce3..73ae9135eb319 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -546,7 +546,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIPreAllocateWWMRegsLegacyPass(*PR); initializeSIFormMemoryClausesLegacyPass(*PR); initializeSIPostRABundlerLegacyPass(*PR); - initializeGCNCreateVOPDPass(*PR); + initializeGCNCreateVOPDLegacyPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); @@ -2149,7 +2149,7 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { -// TODO: addPass(GCNCreateVOPDPass()); +addPass(GCNCreateVOPDPass()); } // TODO: addPass(SIMemoryLegalizerPass()); // TODO: addPass(SIInsertWaitcntsPass()); diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index d40a1a2a10d9b..614262e817162 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "gcn-create-vopd" @@ -36,7 +37,7 @@ using namespace llvm; namespace { -class GCNCreateVOPD : public 
MachineFunctionPass { +class GCNCreateVOPD { private: class VOPDCombineInfo { public: @@ -49,20 +50,8 @@ class GCNCreateVOPD : public MachineFunctionPass { }; public: - static char ID; const GCNSubtarget *ST = nullptr; - GCNCreateVOPD() : MachineFunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { -AU.setPreservesCFG(); -MachineFunctionPass::getAnalysisUsage(AU); - } - - StringRef getPassName() const override { -return "GCN Create VOPD Instructions"; - } - bool doReplace(const SIInstrInfo *SII, VOPDCombineInfo &CI) { auto *FirstMI = CI.FirstMI; auto *SecondMI = CI.SecondMI; @@ -112,9 +101,7 @@ class GCNCreateVOPD : public MachineFunctionPass { return true; } - bool runOnMachineFunction(MachineFunction &MF) override { -if (skipFunction(MF.getFunction())) - return false; + b
[llvm-branch-commits] [llvm] [CodeGen][NPM] Port BranchRelaxation to NPM (PR #130067)
optimisan wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/130067?utm_source=stack-comment-downstack-mergeability-warning). > Learn more: https://graphite.dev/docs/merge-pull-requests * **#130069** https://app.graphite.dev/github/pr/llvm/llvm-project/130069?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130068** https://app.graphite.dev/github/pr/llvm/llvm-project/130068?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130067** https://app.graphite.dev/github/pr/llvm/llvm-project/130067?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/130067?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#130066** https://app.graphite.dev/github/pr/llvm/llvm-project/130066?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130065** https://app.graphite.dev/github/pr/llvm/llvm-project/130065?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130064** https://app.graphite.dev/github/pr/llvm/llvm-project/130064?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130063** https://app.graphite.dev/github/pr/llvm/llvm-project/130063?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130062** https://app.graphite.dev/github/pr/llvm/llvm-project/130062?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130061** https://app.graphite.dev/github/pr/llvm/llvm-project/130061?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130060** https://app.graphite.dev/github/pr/llvm/llvm-project/130060?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#130059** https://app.graphite.dev/github/pr/llvm/llvm-project/130059?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129866** https://app.graphite.dev/github/pr/llvm/llvm-project/129866?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129865** https://app.graphite.dev/github/pr/llvm/llvm-project/129865?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129857** https://app.graphite.dev/github/pr/llvm/llvm-project/129857?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129853** https://app.graphite.dev/github/pr/llvm/llvm-project/129853?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#129828** https://app.graphite.dev/github/pr/llvm/llvm-project/129828?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/130067 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode (PR #130037)
@@ -0,0 +1,356 @@ +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR + +--- | arsenm wrote: Is the IR section needed just for the calling conventions? https://github.com/llvm/llvm-project/pull/130037 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen][StaticDataSplitter]Support constant pool partitioning (PR #129781)
@@ -148,17 +184,9 @@ bool StaticDataSplitter::partitionStaticDataWithProfiles(MachineFunction &MF) { if (MJTI->updateJumpTableEntryHotness(JTI, Hotness)) ++NumChangedJumpTables; -} else { - // Find global variables with local linkage. - const GlobalVariable *GV = - getLocalLinkageGlobalVariable(Op.getGlobal()); - // Skip 'special' global variables conservatively because they are - // often handled specially, and skip those not in static data - // sections. - if (!GV || GV->getName().starts_with("llvm.") || - !inStaticDataSection(GV, TM)) -continue; - SDPI->addConstantProfileCount(GV, Count); +} else if (const Constant *C = + getConstant(Op, MF.getTarget(), MF.getConstantPool())) { + SDPI->addConstantProfileCount(C, Count); williamweixiao wrote: Yes, we don't need to update "NumChangedJumpTables" here. But why do we need to return "true" or "false" for this function? https://github.com/llvm/llvm-project/pull/129781 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Port SIInsertHardClauses to NPM (PR #130062)
https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130062 None >From 88d2174897b3a23ea0a7d8b1e915eb99c0992696 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Thu, 6 Mar 2025 04:52:38 + Subject: [PATCH] [AMDGPU][NPM] Port SIInsertHardClauses to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 8 ++- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- .../lib/Target/AMDGPU/SIInsertHardClauses.cpp | 50 +-- .../CodeGen/AMDGPU/hard-clauses-img-gfx10.mir | 1 + .../CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 1 + 6 files changed, 46 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index dbd81add85753..635cdf7ef12df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -376,6 +376,12 @@ class SIInsertWaitcntsPass : public PassInfoMixin { static bool isRequired() { return true; } }; +class SIInsertHardClausesPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, +MachineFunctionAnalysisManager &MFAM); +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -449,7 +455,7 @@ extern char &SIModeRegisterID; void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &); extern char &AMDGPUInsertDelayAluID; -void initializeSIInsertHardClausesPass(PassRegistry &); +void initializeSIInsertHardClausesLegacyPass(PassRegistry &); extern char &SIInsertHardClausesID; void initializeSIInsertWaitcntsLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index c4641cba60e53..3eabe087a8a33 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -109,6 +109,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); 
MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) +MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) @@ -131,7 +132,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizations DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c3cc1dc6e495b..6c24fe5f1441a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -534,7 +534,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowLegacyPass(*PR); initializeAMDGPUInsertDelayAluLegacyPass(*PR); - initializeSIInsertHardClausesPass(*PR); + initializeSIInsertHardClausesLegacyPass(*PR); initializeSIInsertWaitcntsLegacyPass(*PR); initializeSIModeRegisterLegacyPass(*PR); initializeSIWholeQuadModeLegacyPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp index dcc60765cc203..71b937f23cc3c 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -36,6 +36,7 
@@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassManager.h" using namespace llvm; @@ -89,18 +90,10 @@ enum HardClauseType { HARDCLAUSE_ILLEGAL, }; -class SIInsertHardClauses : public MachineFunctionPass { +class SIInsertHardClauses { public: - static char ID; const GCNSubtarget *ST = nullptr; - SIInsertHardClauses() : MachineFunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { -AU.setPreservesCFG(); -MachineFunctionPass::getAnalysisUsage(AU); - } - HardClauseType getHardClauseType(const MachineInstr &MI) { if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) { if (ST->getGeneration() == AMDGPUSubtarget::GFX10) { @@ -189,9 +182,7 @@ class SIInsertHardClauses : public
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Port SIPreEmitPeephole to NPM (PR #130065)
optimisan wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130065?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#130070** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130070?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130069** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130069?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130068** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130068?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130067** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130067?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130066** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130066?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130065** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130065?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130065?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#130064** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130064?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130063** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130063?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130062** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130062?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130061** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130061?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130060** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130060?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130059** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130059?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129866** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129866?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129865** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129865?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129857** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129857?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129853** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129853?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#129828** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/129828?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/130065 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Cleanup AMDGPUPassRegistry.def (PR #130071)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/130071 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Update divergence lowering tests (PR #128702)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/128702 >From c844e3ac372ec27d57c6d5aad3567426a460936f Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 28 Feb 2025 15:54:55 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Update divergence lowering tests In preparations for implementing temporal divergence lowering for global-isel, switch llvm-ir tests for amdgpu divergence lowering to new reg bank select. Requires adding few simple regbanklegalize rules for these tests to work. --- .../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 6 + .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 28 +- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 39 +- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 5 + ...-divergent-i1-phis-no-lane-mask-merging.ll | 97 ++-- .../GlobalISel/divergence-structurizer.ll | 418 ++ .../divergence-temporal-divergent-i1.ll | 398 ++--- .../divergence-temporal-divergent-i1.mir | 400 + .../divergence-temporal-divergent-reg.ll | 57 ++- .../divergence-temporal-divergent-reg.mir | 71 +++ 10 files changed, 1187 insertions(+), 332 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index 8d3e7829e10e1..eb2ece7bece51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -312,6 +312,12 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { } // Opcodes that also support S1. 
+if (Opc == G_FREEZE && +MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) { + RBLHelper.applyMappingTrivial(*MI); + continue; +} + if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT || Opc == AMDGPU::G_IMPLICIT_DEF)) { Register Dst = MI->getOperand(0).getReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 3c007987b8494..3383175fc1bdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -134,6 +134,26 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, switch (Mapping.LoweringMethod) { case DoNotLower: return; + case VccExtToSel: { +LLT Ty = MRI.getType(MI.getOperand(0).getReg()); +Register Src = MI.getOperand(1).getReg(); +unsigned Opc = MI.getOpcode(); +if (Ty == S32 || Ty == S16) { + auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1); + auto False = B.buildConstant({VgprRB, Ty}, 0); + B.buildSelect(MI.getOperand(0).getReg(), Src, True, False); +} +if (Ty == S64) { + auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1); + auto False = B.buildConstant({VgprRB, S32}, 0); + auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False); + B.buildMergeValues( + MI.getOperand(0).getReg(), + {Sel.getReg(0), Opc == G_SEXT ? 
Sel.getReg(0) : False.getReg(0)}); +} +MI.eraseFromParent(); +return; + } case UniExtToSel: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); auto True = B.buildConstant({SgprRB, Ty}, @@ -276,6 +296,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr64: case Vgpr64: return LLT::scalar(64); + case VgprP0: +return LLT::pointer(0, 64); case SgprP1: case VgprP1: return LLT::pointer(1, 64); @@ -383,6 +405,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { return SgprRB; case Vgpr32: case Vgpr64: + case VgprP0: case VgprP1: case VgprP3: case VgprP4: @@ -425,6 +448,7 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprV4S32: case Vgpr32: case Vgpr64: +case VgprP0: case VgprP1: case VgprP3: case VgprP4: @@ -555,6 +579,7 @@ void RegBankLegalizeHelper::applyMappingSrc( // vgpr scalars, pointers and vectors case Vgpr32: case Vgpr64: +case VgprP0: case VgprP1: case VgprP3: case VgprP4: @@ -653,7 +678,8 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { // We accept all types that can fit in some register class. // Uniform G_PHIs have all sgpr registers. // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr. - if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) { + if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) || + Ty == LLT::pointer(4, 64)) { return; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 33018ae9677a3..6ee15709d2fa6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID U
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From f084882197a92f537c38ec19dfabdafdd9f15d09 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 28 Feb 2025 15:56:04 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 44 +++- llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 ++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 52 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +++-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 +++ .../divergence-temporal-divergent-reg.ll | 38 +++--- .../divergence-temporal-divergent-reg.mir | 8 +-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++--- 12 files changed, 176 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..51e9ac30391fe 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -51,6 +51,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "uniformity" @@ 
-342,6 +343,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +400,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(ConstValueRefT, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1138,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(Val, const_cast(User), + Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1146,6 +1162,12 @@ template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; + // When we print Value, LLVM IR instruction, we want to print extra new line. + // In LLVM IR print function for Value does not print new line at the end. + // In MIR print for MachineInstr prints new line at the end. + constexpr bool IsMIR = std::is_same::value; + std::string NewLine = IsMIR ? "" : "\n"; + // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. 
@@ -1180,6 +1202,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Value :" << Context.print(Val) << NewLine + << "Used by :" << Context.print(UseInst) << NewLine + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1191,7 +1223,7 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { OS << " DIVERGENT: "; else OS << " "; - OS << Context.print(value) << '\n'; + OS << Context.print(value) << NewLine; } OS << "TERMINATORS\n"; @@ -1203,13 +1235,21 @@ void GenericUniformityAnalysisImpl::prin
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Update divergence lowering tests (PR #128702)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/128702 >From c844e3ac372ec27d57c6d5aad3567426a460936f Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 28 Feb 2025 15:54:55 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Update divergence lowering tests In preparations for implementing temporal divergence lowering for global-isel, switch llvm-ir tests for amdgpu divergence lowering to new reg bank select. Requires adding few simple regbanklegalize rules for these tests to work. --- .../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 6 + .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 28 +- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 39 +- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 5 + ...-divergent-i1-phis-no-lane-mask-merging.ll | 97 ++-- .../GlobalISel/divergence-structurizer.ll | 418 ++ .../divergence-temporal-divergent-i1.ll | 398 ++--- .../divergence-temporal-divergent-i1.mir | 400 + .../divergence-temporal-divergent-reg.ll | 57 ++- .../divergence-temporal-divergent-reg.mir | 71 +++ 10 files changed, 1187 insertions(+), 332 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index 8d3e7829e10e1..eb2ece7bece51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -312,6 +312,12 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { } // Opcodes that also support S1. 
+if (Opc == G_FREEZE && +MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) { + RBLHelper.applyMappingTrivial(*MI); + continue; +} + if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT || Opc == AMDGPU::G_IMPLICIT_DEF)) { Register Dst = MI->getOperand(0).getReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 3c007987b8494..3383175fc1bdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -134,6 +134,26 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, switch (Mapping.LoweringMethod) { case DoNotLower: return; + case VccExtToSel: { +LLT Ty = MRI.getType(MI.getOperand(0).getReg()); +Register Src = MI.getOperand(1).getReg(); +unsigned Opc = MI.getOpcode(); +if (Ty == S32 || Ty == S16) { + auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1); + auto False = B.buildConstant({VgprRB, Ty}, 0); + B.buildSelect(MI.getOperand(0).getReg(), Src, True, False); +} +if (Ty == S64) { + auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1); + auto False = B.buildConstant({VgprRB, S32}, 0); + auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False); + B.buildMergeValues( + MI.getOperand(0).getReg(), + {Sel.getReg(0), Opc == G_SEXT ? 
Sel.getReg(0) : False.getReg(0)}); +} +MI.eraseFromParent(); +return; + } case UniExtToSel: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); auto True = B.buildConstant({SgprRB, Ty}, @@ -276,6 +296,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr64: case Vgpr64: return LLT::scalar(64); + case VgprP0: +return LLT::pointer(0, 64); case SgprP1: case VgprP1: return LLT::pointer(1, 64); @@ -383,6 +405,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { return SgprRB; case Vgpr32: case Vgpr64: + case VgprP0: case VgprP1: case VgprP3: case VgprP4: @@ -425,6 +448,7 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprV4S32: case Vgpr32: case Vgpr64: +case VgprP0: case VgprP1: case VgprP3: case VgprP4: @@ -555,6 +579,7 @@ void RegBankLegalizeHelper::applyMappingSrc( // vgpr scalars, pointers and vectors case Vgpr32: case Vgpr64: +case VgprP0: case VgprP1: case VgprP3: case VgprP4: @@ -653,7 +678,8 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { // We accept all types that can fit in some register class. // Uniform G_PHIs have all sgpr registers. // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr. - if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) { + if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) || + Ty == LLT::pointer(4, 64)) { return; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 33018ae9677a3..6ee15709d2fa6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID U
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Cleanup AMDGPUPassRegistry.def (PR #130071)
https://github.com/optimisan edited https://github.com/llvm/llvm-project/pull/130071 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode (PR #130037)
@@ -0,0 +1,356 @@ +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR + +--- | arsenm wrote: No https://github.com/llvm/llvm-project/pull/130037 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (PR #130047)
https://github.com/rovka edited https://github.com/llvm/llvm-project/pull/130047 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Remove subreg def check for bitcast (PR #130086)
llvmbot wrote: @llvm/pr-subscribers-backend-x86 Author: Matt Arsenault (arsenm) Changes Subregister defs are illegal in SSA. Surprisingly this enables folding into subregister insert patterns in one test. --- Full diff: https://github.com/llvm/llvm-project/pull/130086.diff 2 Files Affected: - (modified) llvm/lib/CodeGen/PeepholeOptimizer.cpp (+4-5) - (modified) llvm/test/CodeGen/X86/pr41619.ll (-2) ``diff diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 4d0fd86eb216f..ec8e97f73546a 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1923,11 +1923,8 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { // Bitcasts with more than one def are not supported. if (Def->getDesc().getNumDefs() != 1) return ValueTrackerResult(); - const MachineOperand DefOp = Def->getOperand(DefIdx); - if (DefOp.getSubReg() != DefSubReg) -// If we look for a different subreg, it means we want a subreg of the src. -// Bails as we do not support composing subregs yet. -return ValueTrackerResult(); + + assert(!Def->getOperand(DefIdx).getSubReg() && "no subregister defs in SSA"); unsigned SrcIdx = Def->getNumOperands(); for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; @@ -1950,6 +1947,8 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { if (SrcIdx >= Def->getNumOperands()) return ValueTrackerResult(); + const MachineOperand &DefOp = Def->getOperand(DefIdx); + // Stop when any user of the bitcast is a SUBREG_TO_REG, replacing with a COPY // will break the assumed guarantees for the upper bits. 
for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(DefOp.getReg())) { diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll index 88dcd7798f0c3..6bca77d05e9a9 100644 --- a/llvm/test/CodeGen/X86/pr41619.ll +++ b/llvm/test/CodeGen/X86/pr41619.ll @@ -6,8 +6,6 @@ define void @foo(double %arg) { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT:vmovq %xmm0, %rax -; CHECK-NEXT:vmovd %eax, %xmm0 -; CHECK-NEXT:vmovq %xmm0, %rax ; CHECK-NEXT:movl %eax, (%rax) ; CHECK-NEXT:movq $0, (%rax) ; CHECK-NEXT:retq `` https://github.com/llvm/llvm-project/pull/130086 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Remove subreg def check for bitcast (PR #130086)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130086?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#130086** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130086?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130086?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#130085** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130085?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130084** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130084?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/130086 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Remove subreg def check for insert_subreg (PR #130085)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130085?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#130086** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130086?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#130085** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130085?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130085?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#130084** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/130084?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/130085 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Remove subreg def check for bitcast (PR #130086)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/130086 Subregister defs are illegal in SSA. Surprisingly this enables folding into subregister insert patterns in one test. >From c7d08110c29c0c37c198fa02b953767eaf68a3be Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 6 Mar 2025 18:50:32 +0700 Subject: [PATCH] PeepholeOpt: Remove subreg def check for bitcast Subregister defs are illegal in SSA. Surprisingly this enables folding into subregister insert patterns in one test. --- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 9 - llvm/test/CodeGen/X86/pr41619.ll | 2 -- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 4d0fd86eb216f..ec8e97f73546a 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1923,11 +1923,8 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { // Bitcasts with more than one def are not supported. if (Def->getDesc().getNumDefs() != 1) return ValueTrackerResult(); - const MachineOperand DefOp = Def->getOperand(DefIdx); - if (DefOp.getSubReg() != DefSubReg) -// If we look for a different subreg, it means we want a subreg of the src. -// Bails as we do not support composing subregs yet. -return ValueTrackerResult(); + + assert(!Def->getOperand(DefIdx).getSubReg() && "no subregister defs in SSA"); unsigned SrcIdx = Def->getNumOperands(); for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; @@ -1950,6 +1947,8 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { if (SrcIdx >= Def->getNumOperands()) return ValueTrackerResult(); + const MachineOperand &DefOp = Def->getOperand(DefIdx); + // Stop when any user of the bitcast is a SUBREG_TO_REG, replacing with a COPY // will break the assumed guarantees for the upper bits. 
for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(DefOp.getReg())) { diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll index 88dcd7798f0c3..6bca77d05e9a9 100644 --- a/llvm/test/CodeGen/X86/pr41619.ll +++ b/llvm/test/CodeGen/X86/pr41619.ll @@ -6,8 +6,6 @@ define void @foo(double %arg) { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT:vmovq %xmm0, %rax -; CHECK-NEXT:vmovd %eax, %xmm0 -; CHECK-NEXT:vmovq %xmm0, %rax ; CHECK-NEXT:movl %eax, (%rax) ; CHECK-NEXT:movq $0, (%rax) ; CHECK-NEXT:retq ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Remove subreg def check for bitcast (PR #130086)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/130086 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Remove subreg def check for insert_subreg (PR #130085)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/130085 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Skip out-of-range pending relocations (PR #116964)
paschalis-mpeis wrote: Hey Maksim, Extending Relocations is even better. Thanks for the suggestion and the review. Before proceeding, and regarding the size overheads, I want to highlight an inconsistency with LLVM’s ObjectFile, where the type is 64 bits ([see here](https://github.com/llvm/llvm-project/blob/16cd5cdf4d6387e34d2bb723bc26c331c8d89d75/llvm/include/llvm/Object/ObjectFile.h#L628)). We only have 3 inlined sites of this in `RewriteInstance` (eg one is [here](https://github.com/llvm/llvm-project/blob/3c357a49d61e4c81a1ac016502ee504521bc8dda/bolt/lib/Rewrite/RewriteInstance.cpp#L2408)). If you agree, I'll proceed with an NFCI change, adding assertion overflow checks at these sites. https://github.com/llvm/llvm-project/pull/116964 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Dynamic VGPR support for llvm.amdgcn.cs.chain (PR #130094)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Diana Picus (rovka) Changes The llvm.amdgcn.cs.chain intrinsic has a 'flags' operand which may indicate that we want to reallocate the VGPRs before performing the call. A call with the following arguments: ``` llvm.amdgcn.cs.chain %callee, %exec, %sgpr_args, %vgpr_args, /*flags*/0x1, %num_vgprs, %fallback_exec, %fallback_callee ``` is supposed to do the following: - copy the SGPR and VGPR args into their respective registers - try to change the VGPR allocation - if the allocation has succeeded, set EXEC to %exec and jump to %callee, otherwise set EXEC to %fallback_exec and jump to %fallback_callee This patch implements the dynamic VGPR behaviour by generating an S_ALLOC_VGPR followed by S_CSELECT_B32/64 instructions for the EXEC and callee. The rest of the call sequence is left undisturbed (i.e. identical to the case where the flags are 0 and we don't use dynamic VGPRs). We achieve this by introducing some new pseudos (SI_CS_CHAIN_TC_Wn_DVGPR) which are expanded in the SILateBranchLowering pass, just like the simpler SI_CS_CHAIN_TC_Wn pseudos. The main reason is so that we don't risk other passes (particularly the PostRA scheduler) introducing instructions between the S_ALLOC_VGPR and the jump. Such instructions might end up using VGPRs that have been deallocated, or the wrong EXEC mask. Once the whole backend treats S_ALLOC_VGPR and changes to EXEC as barriers for instructions that use VGPRs, we could in principle move the expansion earlier (but in the absence of a good reason for that my personal preference is to keep it later in order to make debugging easier). Since the expansion happens after register allocation, we're careful to select constants to immediate operands instead of letting ISel generate S_MOVs which could interfere with register allocation (i.e. make it look like we need more registers than we actually do). 
For GFX12, S_ALLOC_VGPR only works in wave32 mode, so we bail out during ISel in wave64 mode. However, we can define the pseudos for wave64 too so it's easy to handle if future generations support it. --- Patch is 94.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130094.diff 11 Files Affected: - (modified) llvm/include/llvm/CodeGen/SelectionDAGISel.h (+15-14) - (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+9-4) - (modified) llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp (+95-31) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+66-22) - (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+51-25) - (modified) llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp (+55-7) - (added) llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll (+97) - (modified) llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll (+12-24) - (modified) llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll (+12-24) - (added) llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-dyn-vgpr-w32.ll (+315) - (added) llvm/test/CodeGen/AMDGPU/remove-register-flags.mir (+19) ``diff diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index e9452a6dc6233..55f8f19d437a0 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -328,20 +328,21 @@ class SelectionDAGISel { }; enum { -OPFL_None = 0, // Node has no chain or glue input and isn't variadic. -OPFL_Chain = 1, // Node has a chain input. -OPFL_GlueInput = 2, // Node has a glue input. -OPFL_GlueOutput = 4, // Node has a glue output. -OPFL_MemRefs= 8, // Node gets accumulated MemRefs. -OPFL_Variadic0 = 1<<4, // Node is variadic, root has 0 fixed inputs. -OPFL_Variadic1 = 2<<4, // Node is variadic, root has 1 fixed inputs. -OPFL_Variadic2 = 3<<4, // Node is variadic, root has 2 fixed inputs. 
-OPFL_Variadic3 = 4<<4, // Node is variadic, root has 3 fixed inputs. -OPFL_Variadic4 = 5<<4, // Node is variadic, root has 4 fixed inputs. -OPFL_Variadic5 = 6<<4, // Node is variadic, root has 5 fixed inputs. -OPFL_Variadic6 = 7<<4, // Node is variadic, root has 6 fixed inputs. - -OPFL_VariadicInfo = OPFL_Variadic6 +OPFL_None = 0, // Node has no chain or glue input and isn't variadic. +OPFL_Chain = 1, // Node has a chain input. +OPFL_GlueInput = 2, // Node has a glue input. +OPFL_GlueOutput = 4, // Node has a glue output. +OPFL_MemRefs = 8,// Node gets accumulated MemRefs. +OPFL_Variadic0 = 1 << 4, // Node is variadic, root has 0 fixed inputs. +OPFL_Variadic1 = 2 << 4, // Node is variadic, root has 1 fixed inputs. +OPFL_Variadic2 = 3 << 4, // Node is variadic, root has 2 fixed inputs. +OPFL_Variadic3 = 4 << 4, // Node is variadic, root has 3 fixed inputs. +OPFL_Variadic4 = 5 << 4, // Node is vari
[llvm-branch-commits] [llvm] 76911bf - Revert "Revert "[LTO][Pipelines][Coro] De-duplicate Coro passes" (#129977)"
Author: Vitaly Buka Date: 2025-03-06T07:57:35-08:00 New Revision: 76911bfffd150a5c5ef0f8ec54ba526ffc09cafb URL: https://github.com/llvm/llvm-project/commit/76911bfffd150a5c5ef0f8ec54ba526ffc09cafb DIFF: https://github.com/llvm/llvm-project/commit/76911bfffd150a5c5ef0f8ec54ba526ffc09cafb.diff LOG: Revert "Revert "[LTO][Pipelines][Coro] De-duplicate Coro passes" (#129977)" This reverts commit 3ccacc4e44afa66f20dd6430bc7ff966cc670708. Added: Modified: llvm/lib/Passes/PassBuilderPipelines.cpp llvm/test/LTO/X86/coro.ll llvm/test/Other/new-pm-defaults.ll llvm/test/Other/new-pm-lto-defaults.ll Removed: diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 07db107325f02..546a5eb1ec283 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -419,14 +419,16 @@ static bool isLTOPostLink(ThinOrFullLTOPhase Phase) { // Helper to wrap conditionally Coro passes. static CoroConditionalWrapper buildCoroWrapper(ThinOrFullLTOPhase Phase) { - // TODO: Skip passes according to Phase. 
ModulePassManager CoroPM; - CoroPM.addPass(CoroEarlyPass()); - CGSCCPassManager CGPM; - CGPM.addPass(CoroSplitPass()); - CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); - CoroPM.addPass(CoroCleanupPass()); - CoroPM.addPass(GlobalDCEPass()); + if (!isLTOPostLink(Phase)) +CoroPM.addPass(CoroEarlyPass()); + if (!isLTOPreLink(Phase)) { +CGSCCPassManager CGPM; +CGPM.addPass(CoroSplitPass()); +CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); +CoroPM.addPass(CoroCleanupPass()); +CoroPM.addPass(GlobalDCEPass()); + } return CoroConditionalWrapper(std::move(CoroPM)); } @@ -1010,7 +1012,7 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( RequireAnalysisPass())); - if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { + if (!isLTOPreLink(Phase)) { MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); MainCGPipeline.addPass(CoroAnnotationElidePass()); } @@ -1060,7 +1062,7 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, buildFunctionSimplificationPipeline(Level, Phase), PTO.EagerlyInvalidateAnalyses)); - if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { + if (!isLTOPreLink(Phase)) { MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( CoroSplitPass(Level != OptimizationLevel::O0))); MPM.addPass( @@ -1120,7 +1122,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // Do basic inference of function attributes from known properties of system // libraries and other oracles. MPM.addPass(InferFunctionAttrsPass()); -MPM.addPass(CoroEarlyPass()); +if (!isLTOPostLink(Phase)) + MPM.addPass(CoroEarlyPass()); FunctionPassManager EarlyFPM; EarlyFPM.addPass(EntryExitInstrumenterPass(/*PostInlining=*/false)); @@ -1290,7 +1293,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // and argument promotion. 
MPM.addPass(DeadArgumentEliminationPass()); - if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink) + if (!isLTOPreLink(Phase)) MPM.addPass(CoroCleanupPass()); // Optimize globals now that functions are fully simplified. @@ -1955,9 +1958,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, return MPM; } - // TODO: Skip to match buildCoroWrapper. - MPM.addPass(CoroEarlyPass()); - // Optimize globals to try and fold them into constants. MPM.addPass(GlobalOptPass()); diff --git a/llvm/test/LTO/X86/coro.ll b/llvm/test/LTO/X86/coro.ll index cde398dd76d85..f9830d964bc69 100644 --- a/llvm/test/LTO/X86/coro.ll +++ b/llvm/test/LTO/X86/coro.ll @@ -1,4 +1,6 @@ -; RUN: llvm-as %s -o %t1.bc +; RUN: opt %s -passes='lto-pre-link' -S -o %t1.ll +; RUN: FileCheck %s --check-prefixes=CHECK,PRELINK --implicit-check-not="call void @llvm.coro" --input-file=%t1.ll +; RUN: llvm-as %t1.ll -o %t1.bc ; RUN: llvm-lto2 run %t1.bc -o %t2.o -r=%t1.bc,test,plx -r=%t1.bc,extern_func,plx -save-temps ; RUN: llvm-dis %t2.o.0.5.precodegen.bc -o - | FileCheck %s --implicit-check-not="call void @llvm.coro" @@ -7,7 +9,9 @@ target triple = "x86_64-unknown-fuchsia" declare void @extern_func() -; CHECK: define {{.*}} void @test( +; CHECK: define{{.*}} void @test( +; PRELINK: call ptr @llvm.coro.subfn.addr +; PRELINK: call ptr @llvm.coro.subfn.addr define void @test(ptr %hdl) { call void @llvm.coro.resume(ptr %hdl) call void @llvm.coro.destroy(ptr %hdl) diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index c554fdbf4c799..30ff1a5879df2 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -230,13 +230
[llvm-branch-commits] [libclc] release/20.x: [libclc] Stop installing CLC headers (#126908) (PR #130017)
llvmbot wrote: @arsenm What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/130017 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][NPM] Port SIInsertWaitcnts to NPM (PR #130061)
https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130061 None >From 10605a79e1d1c6d1c227b98019fd4a4c568345b8 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Thu, 6 Mar 2025 04:41:08 + Subject: [PATCH] [AMDGPU][NPM] Port SIInsertWaitcnts to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 91 +-- llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir | 1 + .../CodeGen/AMDGPU/insert-waitcnts-hang.mir | 1 + .../AMDGPU/vccz-corrupt-bug-workaround.mir| 2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 23b9aa0cf0523..dbd81add85753 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -369,6 +369,13 @@ class SIMemoryLegalizerPass : public PassInfoMixin { MachineFunctionAnalysisManager &MFAM); }; +class SIInsertWaitcntsPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, +MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -445,7 +452,7 @@ extern char &AMDGPUInsertDelayAluID; void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; -void initializeSIInsertWaitcntsPass(PassRegistry&); +void initializeSIInsertWaitcntsLegacyPass(PassRegistry &); extern char &SIInsertWaitcntsID; void initializeSIFormMemoryClausesLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index de959f8a2aa62..c4641cba60e53 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -109,6 +109,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) 
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) +MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) @@ -131,7 +132,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index dbe212ad0a216..c3cc1dc6e495b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -535,7 +535,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIAnnotateControlFlowLegacyPass(*PR); initializeAMDGPUInsertDelayAluLegacyPass(*PR); initializeSIInsertHardClausesPass(*PR); - initializeSIInsertWaitcntsPass(*PR); + initializeSIInsertWaitcntsLegacyPass(*PR); initializeSIModeRegisterLegacyPass(*PR); initializeSIWholeQuadModeLegacyPass(*PR); initializeSILowerControlFlowLegacyPass(*PR); @@ -2153,7 +2153,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { } addPass(SIMemoryLegalizerPass()); - // TODO: addPass(SIInsertWaitcntsPass()); + addPass(SIInsertWaitcntsPass()); // TODO: 
addPass(SIModeRegisterPass()); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ee263f58bcaf2..8951a4144bd68 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/Sequence.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Support/DebugCounter.h" #include "llvm/TargetParser/TargetParser.h" @@ -594,7 +595,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; -class SIInsertWaitcnts : public MachineFunctionPass { +class SIInsertWaitcnts { pri
[llvm-branch-commits] [llvm] [DirectX] Updating DXContainer documentation to add Root Descriptors (PR #129759)
https://github.com/joaosaffran edited https://github.com/llvm/llvm-project/pull/129759 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode (PR #130037)
@@ -0,0 +1,356 @@ +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR arsenm wrote: Don't need -O2 https://github.com/llvm/llvm-project/pull/130037 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (PR #130047)
https://github.com/kerbowa edited https://github.com/llvm/llvm-project/pull/130047 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (PR #130047)
@@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { if (WavesAfter < DAG.MinOccupancy) return true; + // For dynamic VGPR mode, we don't want to waste any VGPR blocks. + if (ST.isDynamicVGPREnabled()) { kerbowa wrote: Should VGPR critical/excess limits also be updated to reflect that you never want to allocate a new block? In this case it will be treated as if it's as bad as spilling. https://github.com/llvm/llvm-project/pull/130047 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (PR #130047)
https://github.com/kerbowa commented: Is there any test for the revert scheduling portion of the change? https://github.com/llvm/llvm-project/pull/130047 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [DirectX] Updating Root Signature documentation with Descriptor table description (PR #129797)
https://github.com/joaosaffran updated https://github.com/llvm/llvm-project/pull/129797 >From 82a7de3b1a22eb7f7630d5b2d6998916ede45a8c Mon Sep 17 00:00:00 2001 From: joaosaffran <126493771+joaosaff...@users.noreply.github.com> Date: Tue, 4 Mar 2025 14:32:03 -0800 Subject: [PATCH 1/2] Updating Root Descriptor documentation --- llvm/docs/DirectX/DXContainer.rst | 70 +++ 1 file changed, 70 insertions(+) diff --git a/llvm/docs/DirectX/DXContainer.rst b/llvm/docs/DirectX/DXContainer.rst index b9a2067368e0f..fd1ff3f04a008 100644 --- a/llvm/docs/DirectX/DXContainer.rst +++ b/llvm/docs/DirectX/DXContainer.rst @@ -544,3 +544,73 @@ Version 1.1 Root Descriptor The Version 1.1 RootDescriptor_V1_1 extends the base structure with the following additional fields: #. **Flags**: Provides additional metadata about the descriptor's usage pattern. + +Root Descriptor Table +~ + +Descriptor tables provide a flexible mechanism for grouping and managing multiple resource descriptors within +a single root signature parameter. They enable efficient binding of complex shader resource sets while minimizing +root signature space consumption. + +.. 
code-block:: cpp + + struct DescriptorRange_V1_0 { + dxbc::DescriptorRangeType RangeType; + uint32_t NumDescriptors; + uint32_t BaseShaderRegister; + uint32_t RegisterSpace; + uint32_t OffsetInDescriptorsFromTableStart; + }; + + struct DescriptorRange_V1_1 { + dxbc::DescriptorRangeType RangeType; + uint32_t NumDescriptors; + uint32_t BaseShaderRegister; + uint32_t RegisterSpace; + uint32_t OffsetInDescriptorsFromTableStart; + Copy// New flags for Version 1.1 + enum Flags { +None= 0x0, +// Descriptors are static and known at root signature creation +DESCRIPTORS_STATIC = 0x1, +// Descriptors remain constant during command list execution +DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS = 0x2, +// Descriptors may change frequently +DESCRIPTORS_VOLATILE= 0x4 + }; + + // Bitfield of flags from the Flags enum + uint32_t Flags; + }; + + struct RootDescriptorTable { + uint32_t NumDescriptorRanges; + uint32_t DescriptorRangesOffset; + }; + + +Descriptor Range Version 1.0 + + +The Version 1.0 ``DescriptorRange_V1_0`` provides basic descriptor range definition: + +#. **RangeType**: Type of descriptors (CBV, SRV, UAV, or Sampler) +#. **NumDescriptors**: Number of descriptors in the range +#. **BaseShaderRegister**: First shader register in the range +#. **RegisterSpace**: Register space for the range +#. **OffsetInDescriptorsFromTableStart**: Offset from the descriptor heap start + +Descriptor Range Version 1.1 + +The Version 1.1 DescriptorRange_V1_1 extends the base structure with performance optimization flags. + +#. **Flags**: Provide additional information about the descriptors and enable further driver optimizations. + +Root Descriptor Table +' + +RootDescriptorTable provides basic table structure: + +#. **NumDescriptorRanges**: Number of descriptor ranges +#. 
**DescriptorRangesOffset**: Offset to descriptor range array + >From 16e3642a23540edb2e30689655ef0e9722f2 Mon Sep 17 00:00:00 2001 From: joaosaffran <126493771+joaosaff...@users.noreply.github.com> Date: Thu, 6 Mar 2025 11:10:41 -0800 Subject: [PATCH 2/2] Update DXContainer.rst --- llvm/docs/DirectX/DXContainer.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/DirectX/DXContainer.rst b/llvm/docs/DirectX/DXContainer.rst index fd1ff3f04a008..0c652ed65c528 100644 --- a/llvm/docs/DirectX/DXContainer.rst +++ b/llvm/docs/DirectX/DXContainer.rst @@ -568,7 +568,7 @@ root signature space consumption. uint32_t BaseShaderRegister; uint32_t RegisterSpace; uint32_t OffsetInDescriptorsFromTableStart; - Copy// New flags for Version 1.1 + // New flags for Version 1.1 enum Flags { None= 0x0, // Descriptors are static and known at root signature creation ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [llvm] [flang][OpenMP] Parse cancel-directive-name as clause (PR #130146)
https://github.com/kparzysz updated https://github.com/llvm/llvm-project/pull/130146 >From 98df18461bb06afa06b8968b157a3c5a5cf50324 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 6 Mar 2025 08:51:34 -0600 Subject: [PATCH 1/2] [flang][OpenMP] Parse cancel-directive-name as clause The cancellable construct names on CANCEL or CANCELLATION POINT directives are actually clauses (with the same names as the corresponding constructs). Instead of parsing them into a custom structure, parse them as a clause, which will make CANCEL/CANCELLATION POINT follow the same uniform scheme as other constructs ( [()] [clauses]). --- flang/include/flang/Parser/dump-parse-tree.h | 2 +- flang/include/flang/Parser/parse-tree.h | 11 +- flang/lib/Parser/openmp-parsers.cpp | 31 - flang/lib/Parser/unparse.cpp | 5 +- flang/lib/Semantics/check-omp-structure.cpp | 133 +-- flang/lib/Semantics/check-omp-structure.h| 5 +- flang/test/Semantics/OpenMP/cancel.f90 | 29 llvm/include/llvm/Frontend/OpenMP/OMP.td | 5 + 8 files changed, 169 insertions(+), 52 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/cancel.f90 diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index fcd902d25fa40..004e22a21ecfa 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -546,6 +546,7 @@ class ParseTreeDumper { #define GEN_FLANG_DUMP_PARSE_TREE_CLAUSES #include "llvm/Frontend/OpenMP/OMP.inc" NODE(parser, OmpClauseList) + NODE(parser, OmpCancellationConstructTypeClause) NODE(parser, OmpContainsClause) NODE(parser, OmpCriticalDirective) NODE(parser, OmpErrorDirective) @@ -689,7 +690,6 @@ class ParseTreeDumper { NODE(parser, OpenMPAtomicConstruct) NODE(parser, OpenMPBlockConstruct) NODE(parser, OpenMPCancelConstruct) - NODE(OpenMPCancelConstruct, If) NODE(parser, OpenMPCancellationPointConstruct) NODE(parser, OpenMPConstruct) NODE(parser, OpenMPCriticalConstruct) diff --git 
a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index a197249ebae91..cb0eb884e1193 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4048,6 +4048,12 @@ struct OmpBindClause { WRAPPER_CLASS_BOILERPLATE(OmpBindClause, Binding); }; +// Artificial clause to represent a cancellable construct. +struct OmpCancellationConstructTypeClause { + TUPLE_CLASS_BOILERPLATE(OmpCancellationConstructTypeClause); + std::tuple> t; +}; + // Ref: [5.2:214] // // contains-clause -> @@ -4870,15 +4876,14 @@ struct OmpCancelType { struct OpenMPCancellationPointConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPCancellationPointConstruct); CharBlock source; - std::tuple t; + std::tuple t; }; // 2.14.1 cancel -> CANCEL construct-type-clause [ [,] if-clause] struct OpenMPCancelConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPCancelConstruct); - WRAPPER_CLASS(If, ScalarLogicalExpr); CharBlock source; - std::tuple> t; + std::tuple t; }; // Ref: [5.0:254-255], [5.1:287-288], [5.2:322-323] diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 80831db0e7d50..51b2567a3894d 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -803,8 +803,9 @@ TYPE_PARSER(construct(many(maybe(","_tok) >> TYPE_PARSER(construct(many(maybe(","_tok) >> construct(unwrap(OmpDirectiveNameParser{}) -TYPE_PARSER("ABSENT" >> construct(construct( -parenthesized(Parser{}))) || +TYPE_PARSER( // +"ABSENT" >> construct(construct( +parenthesized(Parser{}))) || "ACQUIRE" >> construct(construct()) || "ACQ_REL" >> construct(construct()) || "AFFINITY" >> construct(construct( @@ -981,7 +982,24 @@ TYPE_PARSER("ABSENT" >> construct(construct( "UPDATE" >> construct(construct( parenthesized(Parser{}))) || "WHEN" >> construct(construct( - parenthesized(Parser{} + parenthesized(Parser{}))) || +// Cancellable constructs +construct(construct( +construct( // +construct(verbatim("DO"_id)), 
+maybe(parenthesized(scalarLogicalExpr) || +construct(construct( +construct( // +construct(verbatim("PARALLEL"_id)), +maybe(parenthesized(scalarLogicalExpr) || +construct(construct( +construct( // +construct(verbatim("SECTIONS"_id)), +maybe(parenthesized(scalarLogicalExpr) || +construct(construct( +construct( // +construct(verbatim("TASKGROUP"_id)), +maybe(parenthesized(scalarLogicalExpr)) // [Clause, [Clause], ...] TYPE_PARSER(sourced(construct( @@ -1104,11 +1122,11 @@ TYPE_PARSER(sourced(construct( // 2.14.2 Cancellation Point
[llvm-branch-commits] [flang] [flang][OpenMP] Map simple `do concurrent` loops to OpenMP host constructs (PR #127633)
ergawy wrote: Ping! Please have a look when you have time. https://github.com/llvm/llvm-project/pull/127633 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Implement bitcode autoupgrade for old style enqueue blocks (PR #128520)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/128520 >From d2479a3b4f3613a01fb62658b6fd67d28561fd55 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 17 Nov 2023 14:21:52 +0900 Subject: [PATCH] AMDGPU: Implement bitcode autoupgrade for old style enqueue blocks Introduces circular dependency in build for appendToUsed, and I'm not sure it's worth the trouble to fix it. We can most likely get away without upgrading this. We could move appendToUsed / appendToCompilerUsed directly to be in Module. --- llvm/lib/IR/AutoUpgrade.cpp | 49 +++ llvm/lib/IR/CMakeLists.txt| 1 + .../amdgpu-autoupgrade-enqueued-block.ll | 138 ++ 3 files changed, 188 insertions(+) create mode 100644 llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index cb4ecc60aa473..0e4e135e90972 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" #include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include #include #include @@ -5518,6 +5519,51 @@ struct AMDGPUUnsafeFPAtomicsUpgradeVisitor }; } // namespace +static StructType *getAMDGPURuntimeHandleType(LLVMContext &C, + Type *KernelDescriptorPtrTy) { + Type *Int32 = Type::getInt32Ty(C); + return StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32}, +"block.runtime.handle.t"); +} + +/// Rewrite to new scheme for enqueued block lowering +static void upgradeAMDGPUKernelEnqueuedBlock(Function &F) { + if (F.isMaterializable()) { +// A verifier error is produced if we add metadata to the function during +// linking. 
+return; + } + + const StringLiteral EnqueuedBlockName("enqueued-block"); + if (!F.hasFnAttribute(EnqueuedBlockName)) +return; + + F.removeFnAttr(EnqueuedBlockName); + + Module *M = F.getParent(); + LLVMContext &Ctx = M->getContext(); + const DataLayout &DL = M->getDataLayout(); + + StructType *HandleTy = getAMDGPURuntimeHandleType( + Ctx, PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace())); + + Twine RuntimeHandleName = F.getName() + ".runtime.handle"; + + auto *RuntimeHandle = new GlobalVariable( + *M, HandleTy, + /*isConstant=*/true, F.getLinkage(), + /*Initializer=*/ConstantAggregateZero::get(HandleTy), RuntimeHandleName, + /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, + DL.getDefaultGlobalsAddressSpace(), + /*isExternallyInitialized=*/true); + RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle"); + + MDNode *HandleAsMD = MDNode::get(Ctx, ValueAsMetadata::get(RuntimeHandle)); + F.setMetadata(LLVMContext::MD_associated, HandleAsMD); + + appendToUsed(*M, {&F, RuntimeHandle}); +} + void llvm::UpgradeFunctionAttributes(Function &F) { // If a function definition doesn't have the strictfp attribute, // convert any callsite strictfp attributes to nobuiltin. 
@@ -5558,6 +5604,9 @@ void llvm::UpgradeFunctionAttributes(Function &F) { F.removeFnAttr("amdgpu-unsafe-fp-atomics"); } } + + if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) +upgradeAMDGPUKernelEnqueuedBlock(F); } static bool isOldLoopArgument(Metadata *MD) { diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt index eb00829fd8c70..a78c58c807f6a 100644 --- a/llvm/lib/IR/CMakeLists.txt +++ b/llvm/lib/IR/CMakeLists.txt @@ -92,6 +92,7 @@ add_llvm_component_library(LLVMCore LINK_COMPONENTS BinaryFormat Demangle + TransformUtils Remarks Support TargetParser diff --git a/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll b/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll new file mode 100644 index 0..41521c1f2025d --- /dev/null +++ b/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll @@ -0,0 +1,138 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +%struct.ndrange_t = type { i32 } +%opencl.queue_t = type opaque + +; CHECK: %block.runtime.handle.t = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.0 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.1 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.2 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.3 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.4 = type { ptr, i32, i32 } + + +; CHECK: @kernel_address_user = global [1 x ptr] [ptr @block_has_used_kernel_address] +; CHECK: @__test_block_invoke_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @__test_block_invoke_2_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.0 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @block_has_used_k
[llvm-branch-commits] [flang] [llvm] [flang][OpenMP] Parse cancel-directive-name as clause (PR #130146)
https://github.com/kiranchandramohan approved this pull request. LG. https://github.com/llvm/llvm-project/pull/130146 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [CUDA][HIP] fix virtual dtor host/device attr (PR #130126)
https://github.com/yxsamliu edited https://github.com/llvm/llvm-project/pull/130126 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang][OpenMP] Accept old FLUSH syntax in METADIRECTIVE (PR #130122)
https://github.com/kparzysz updated https://github.com/llvm/llvm-project/pull/130122 >From bf56b8c80a0f1a7e06dcd3e898172c27e5afabf5 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 5 Mar 2025 08:24:30 -0600 Subject: [PATCH 1/2] [flang][OpenMP] Accept old FLUSH syntax in METADIRECTIVE Accommodate it in OmpDirectiveSpecification, which may become the primary component of the actual FLUSH construct in the future. --- flang/include/flang/Parser/dump-parse-tree.h | 1 + flang/include/flang/Parser/parse-tree.h | 6 ++- flang/lib/Parser/openmp-parsers.cpp | 32 +-- flang/lib/Parser/unparse.cpp | 28 +++--- .../Parser/OpenMP/metadirective-flush.f90 | 54 +++ 5 files changed, 109 insertions(+), 12 deletions(-) create mode 100644 flang/test/Parser/OpenMP/metadirective-flush.f90 diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index a154794e41e9d..fcd902d25fa40 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -491,6 +491,7 @@ class ParseTreeDumper { NODE(OmpWhenClause, Modifier) NODE(parser, OmpDirectiveName) NODE(parser, OmpDirectiveSpecification) + NODE_ENUM(OmpDirectiveSpecification, Flags) NODE(parser, OmpTraitPropertyName) NODE(parser, OmpTraitScore) NODE(parser, OmpTraitPropertyExtension) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 346299b8e5215..a197249ebae91 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4503,13 +4503,15 @@ struct OmpClauseList { // --- Directives and constructs struct OmpDirectiveSpecification { - CharBlock source; + ENUM_CLASS(Flags, None, DeprecatedSyntax); TUPLE_CLASS_BOILERPLATE(OmpDirectiveSpecification); llvm::omp::Directive DirId() const { // return std::get(t).v; } + + CharBlock source; std::tuple>, - std::optional> + std::optional, Flags> t; }; diff --git a/flang/lib/Parser/openmp-parsers.cpp 
b/flang/lib/Parser/openmp-parsers.cpp index b3e76d70c8064..0de7690b90262 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -995,10 +995,34 @@ TYPE_PARSER(sourced(construct( // --- Parsers for directives and constructs -- -TYPE_PARSER(sourced(construct( // -sourced(OmpDirectiveNameParser{}), -maybe(parenthesized(nonemptyList(Parser{}))), -maybe(Parser{} +OmpDirectiveSpecification static makeFlushFromOldSyntax1( +Verbatim &&text, std::optional &&clauses, +std::optional> &&args, +OmpDirectiveSpecification::Flags &&flags) { + return OmpDirectiveSpecification{OmpDirectiveName(text), std::move(args), + std::move(clauses), std::move(flags)}; +} + +TYPE_PARSER(sourced( +// Parse the old syntax: FLUSH [clauses] [(objects)] +construct( // +// Force this old-syntax parser to fail for FLUSH followed by '('. +// Otherwise it could succeed on the new syntax but have one of +// lists absent in the parsed result. +// E.g. for FLUSH(x) SEQ_CST it would find no clauses following +// the directive name, parse the argument list "(x)" and stop. 
+applyFunction(makeFlushFromOldSyntax1, +verbatim("FLUSH"_tok) / !lookAhead("("_tok), +maybe(Parser{}), +maybe(parenthesized(nonemptyList(Parser{}))), +pure(OmpDirectiveSpecification::Flags::DeprecatedSyntax))) || +// Parse the standard syntax: directive [(arguments)] [clauses] +construct( // +sourced(OmpDirectiveNameParser{}), +maybe(parenthesized(nonemptyList(Parser{}))), +maybe(Parser{}), +pure(OmpDirectiveSpecification::Flags::None)) +)) TYPE_PARSER(sourced(construct("NOTHING" >> ok))) diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 4f5c05dc2aa25..262077e62441b 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2094,14 +2094,30 @@ class UnparseVisitor { Word(llvm::omp::getOpenMPDirectiveName(x).str()); } void Unparse(const OmpDirectiveSpecification &x) { -using ArgList = std::list; +auto unparseArgs{[&]() { + using ArgList = std::list; + if (auto &args{std::get>(x.t)}) { +Put("("); +Walk(*args); +Put(")"); + } +}}; +auto unparseClauses{[&]() { + Walk(std::get>(x.t)); +}}; + Walk(std::get(x.t)); -if (auto &args{std::get>(x.t)}) { - Put("("); - Walk(*args); - Put(")"); +auto flags{std::get(x.t)}; +if (flags == OmpDirectiveSpecification::Flags::DeprecatedSyntax) { + if (x.DirId() == llvm::omp::Directive::OMPD_flush) { +// FLUSH clause arglist +unparseClauses(); +unparseArgs(); + } +} else { + unparseArgs();
[llvm-branch-commits] [llvm] AMDGPU: Implement bitcode autoupgrade for old style enqueue blocks (PR #128520)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/128520 >From d2479a3b4f3613a01fb62658b6fd67d28561fd55 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 17 Nov 2023 14:21:52 +0900 Subject: [PATCH] AMDGPU: Implement bitcode autoupgrade for old style enqueue blocks Introduces circular dependency in build for appendToUsed, and I'm not sure it's worth the trouble to fix it. We can most likely get away without upgrading this. We could move appendToUsed / appendToCompilerUsed directly to be in Module. --- llvm/lib/IR/AutoUpgrade.cpp | 49 +++ llvm/lib/IR/CMakeLists.txt| 1 + .../amdgpu-autoupgrade-enqueued-block.ll | 138 ++ 3 files changed, 188 insertions(+) create mode 100644 llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index cb4ecc60aa473..0e4e135e90972 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" #include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include #include #include @@ -5518,6 +5519,51 @@ struct AMDGPUUnsafeFPAtomicsUpgradeVisitor }; } // namespace +static StructType *getAMDGPURuntimeHandleType(LLVMContext &C, + Type *KernelDescriptorPtrTy) { + Type *Int32 = Type::getInt32Ty(C); + return StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32}, +"block.runtime.handle.t"); +} + +/// Rewrite to new scheme for enqueued block lowering +static void upgradeAMDGPUKernelEnqueuedBlock(Function &F) { + if (F.isMaterializable()) { +// A verifier error is produced if we add metadata to the function during +// linking. 
+return; + } + + const StringLiteral EnqueuedBlockName("enqueued-block"); + if (!F.hasFnAttribute(EnqueuedBlockName)) +return; + + F.removeFnAttr(EnqueuedBlockName); + + Module *M = F.getParent(); + LLVMContext &Ctx = M->getContext(); + const DataLayout &DL = M->getDataLayout(); + + StructType *HandleTy = getAMDGPURuntimeHandleType( + Ctx, PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace())); + + Twine RuntimeHandleName = F.getName() + ".runtime.handle"; + + auto *RuntimeHandle = new GlobalVariable( + *M, HandleTy, + /*isConstant=*/true, F.getLinkage(), + /*Initializer=*/ConstantAggregateZero::get(HandleTy), RuntimeHandleName, + /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, + DL.getDefaultGlobalsAddressSpace(), + /*isExternallyInitialized=*/true); + RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle"); + + MDNode *HandleAsMD = MDNode::get(Ctx, ValueAsMetadata::get(RuntimeHandle)); + F.setMetadata(LLVMContext::MD_associated, HandleAsMD); + + appendToUsed(*M, {&F, RuntimeHandle}); +} + void llvm::UpgradeFunctionAttributes(Function &F) { // If a function definition doesn't have the strictfp attribute, // convert any callsite strictfp attributes to nobuiltin. 
@@ -5558,6 +5604,9 @@ void llvm::UpgradeFunctionAttributes(Function &F) { F.removeFnAttr("amdgpu-unsafe-fp-atomics"); } } + + if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) +upgradeAMDGPUKernelEnqueuedBlock(F); } static bool isOldLoopArgument(Metadata *MD) { diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt index eb00829fd8c70..a78c58c807f6a 100644 --- a/llvm/lib/IR/CMakeLists.txt +++ b/llvm/lib/IR/CMakeLists.txt @@ -92,6 +92,7 @@ add_llvm_component_library(LLVMCore LINK_COMPONENTS BinaryFormat Demangle + TransformUtils Remarks Support TargetParser diff --git a/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll b/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll new file mode 100644 index 0000000000000..41521c1f2025d --- /dev/null +++ b/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll @@ -0,0 +1,138 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +%struct.ndrange_t = type { i32 } +%opencl.queue_t = type opaque + +; CHECK: %block.runtime.handle.t = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.0 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.1 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.2 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.3 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.4 = type { ptr, i32, i32 } + + +; CHECK: @kernel_address_user = global [1 x ptr] [ptr @block_has_used_kernel_address] +; CHECK: @__test_block_invoke_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @__test_block_invoke_2_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.0 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @block_has_used_k
[llvm-branch-commits] [llvm] PeepholeOpt: Remove subreg def check for bitcast (PR #130086)
https://github.com/qcolombet approved this pull request. https://github.com/llvm/llvm-project/pull/130086 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits