[llvm-branch-commits] [llvm] DAG: Move soft float predicate management into RuntimeLibcalls (PR #142905)
llvmbot wrote: @llvm/pr-subscribers-llvm-selectiondag Author: Matt Arsenault (arsenm) Changes Work towards making RuntimeLibcalls the centralized location for all libcall information. This requires changing the encoding from tracking the ISD::CondCode to using CmpInst::Predicate. --- Patch is 35.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142905.diff 6 Files Affected: - (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+6-8) - (modified) llvm/include/llvm/IR/RuntimeLibcalls.h (+25) - (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+3-2) - (modified) llvm/lib/IR/RuntimeLibcalls.cpp (+36) - (modified) llvm/lib/Target/ARM/ARMISelLowering.cpp (+89-89) - (modified) llvm/lib/Target/MSP430/MSP430ISelLowering.cpp (+65-65) ``diff diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 9c453f51e129d..0d157de479141 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3572,20 +3572,18 @@ class LLVM_ABI TargetLoweringBase { /// Override the default CondCode to be used to test the result of the /// comparison libcall against zero. - /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD. - void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) { -CmpLibcallCCs[Call] = CC; + /// FIXME: This should be removed + void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) { +Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred); } - /// Get the CondCode that's to be used to test the result of the comparison /// libcall against zero. - /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD. 
- ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const { -return CmpLibcallCCs[Call]; + CmpInst::Predicate + getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { +return Libcalls.getSoftFloatCmpLibcallPredicate(Call); } - /// Set the CallingConv that should be used for the specified libcall. void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { Libcalls.setLibcallCallingConv(Call, CC); diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 26c085031a48a..6cc65fabfcc99 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" @@ -73,6 +74,20 @@ struct RuntimeLibcallsInfo { LibcallRoutineNames + RTLIB::UNKNOWN_LIBCALL); } + /// Get the comparison predicate that's to be used to test the result of the + /// comparison libcall against zero. This should only be used with + /// floating-point compare libcalls. + CmpInst::Predicate + getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { +return SoftFloatCompareLibcallPredicates[Call]; + } + + // FIXME: This should be removed. This should be private constant. + void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call, + CmpInst::Predicate Pred) { +SoftFloatCompareLibcallPredicates[Call] = Pred; + } + private: /// Stores the name each libcall. const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1]; @@ -80,6 +95,14 @@ struct RuntimeLibcallsInfo { /// Stores the CallingConv that should be used for each libcall. CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL]; + /// The condition type that should be used to test the result of each of the + /// soft floating-point comparison libcall against integer zero. 
+ /// + // FIXME: This is only relevant for the handful of floating-point comparison + // runtime calls; it's excessive to have a table entry for every single + // opcode. + CmpInst::Predicate SoftFloatCompareLibcallPredicates[RTLIB::UNKNOWN_LIBCALL]; + static bool darwinHasSinCos(const Triple &TT) { assert(TT.isOSDarwin() && "should be called with darwin triple"); // Don't bother with 32 bit x86. @@ -95,6 +118,8 @@ struct RuntimeLibcallsInfo { return true; } + void initSoftFloatCmpLibcallPredicates(); + /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. LLVM_ABI void initLibcalls(const Triple &TT); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a3c4cb4ea0582..5b1c6236afc32 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/Analysis.h"
[llvm-branch-commits] [llvm] DAG: Move soft float predicate management into RuntimeLibcalls (PR #142905)
llvmbot wrote: @llvm/pr-subscribers-backend-arm Author: Matt Arsenault (arsenm) Changes Work towards making RuntimeLibcalls the centralized location for all libcall information. This requires changing the encoding from tracking the ISD::CondCode to using CmpInst::Predicate. --- Patch is 35.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142905.diff 6 Files Affected: - (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+6-8) - (modified) llvm/include/llvm/IR/RuntimeLibcalls.h (+25) - (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+3-2) - (modified) llvm/lib/IR/RuntimeLibcalls.cpp (+36) - (modified) llvm/lib/Target/ARM/ARMISelLowering.cpp (+89-89) - (modified) llvm/lib/Target/MSP430/MSP430ISelLowering.cpp (+65-65) ``diff diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 9c453f51e129d..0d157de479141 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3572,20 +3572,18 @@ class LLVM_ABI TargetLoweringBase { /// Override the default CondCode to be used to test the result of the /// comparison libcall against zero. - /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD. - void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) { -CmpLibcallCCs[Call] = CC; + /// FIXME: This should be removed + void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) { +Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred); } - /// Get the CondCode that's to be used to test the result of the comparison /// libcall against zero. - /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD. 
- ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const { -return CmpLibcallCCs[Call]; + CmpInst::Predicate + getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { +return Libcalls.getSoftFloatCmpLibcallPredicate(Call); } - /// Set the CallingConv that should be used for the specified libcall. void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { Libcalls.setLibcallCallingConv(Call, CC); diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 26c085031a48a..6cc65fabfcc99 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" @@ -73,6 +74,20 @@ struct RuntimeLibcallsInfo { LibcallRoutineNames + RTLIB::UNKNOWN_LIBCALL); } + /// Get the comparison predicate that's to be used to test the result of the + /// comparison libcall against zero. This should only be used with + /// floating-point compare libcalls. + CmpInst::Predicate + getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { +return SoftFloatCompareLibcallPredicates[Call]; + } + + // FIXME: This should be removed. This should be private constant. + void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call, + CmpInst::Predicate Pred) { +SoftFloatCompareLibcallPredicates[Call] = Pred; + } + private: /// Stores the name each libcall. const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1]; @@ -80,6 +95,14 @@ struct RuntimeLibcallsInfo { /// Stores the CallingConv that should be used for each libcall. CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL]; + /// The condition type that should be used to test the result of each of the + /// soft floating-point comparison libcall against integer zero. 
+ /// + // FIXME: This is only relevant for the handful of floating-point comparison + // runtime calls; it's excessive to have a table entry for every single + // opcode. + CmpInst::Predicate SoftFloatCompareLibcallPredicates[RTLIB::UNKNOWN_LIBCALL]; + static bool darwinHasSinCos(const Triple &TT) { assert(TT.isOSDarwin() && "should be called with darwin triple"); // Don't bother with 32 bit x86. @@ -95,6 +118,8 @@ struct RuntimeLibcallsInfo { return true; } + void initSoftFloatCmpLibcallPredicates(); + /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. LLVM_ABI void initLibcalls(const Triple &TT); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a3c4cb4ea0582..5b1c6236afc32 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/Analysis.h" #incl
[llvm-branch-commits] [llvm] DAG: Move soft float predicate management into RuntimeLibcalls (PR #142905)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/142905 Work towards making RuntimeLibcalls the centralized location for all libcall information. This requires changing the encoding from tracking the ISD::CondCode to using CmpInst::Predicate. >From 0b1472786bc8e235718db055745c20d66ade0510 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 5 Jun 2025 14:22:55 +0900 Subject: [PATCH] DAG: Move soft float predicate management into RuntimeLibcalls Work towards making RuntimeLibcalls the centralized location for all libcall information. This requires changing the encoding from tracking the ISD::CondCode to using CmpInst::Predicate. --- llvm/include/llvm/CodeGen/TargetLowering.h| 14 +- llvm/include/llvm/IR/RuntimeLibcalls.h| 25 +++ .../CodeGen/SelectionDAG/TargetLowering.cpp | 5 +- llvm/lib/IR/RuntimeLibcalls.cpp | 36 llvm/lib/Target/ARM/ARMISelLowering.cpp | 178 +- llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 130 ++--- 6 files changed, 224 insertions(+), 164 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 9c453f51e129d..0d157de479141 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3572,20 +3572,18 @@ class LLVM_ABI TargetLoweringBase { /// Override the default CondCode to be used to test the result of the /// comparison libcall against zero. - /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD. - void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) { -CmpLibcallCCs[Call] = CC; + /// FIXME: This should be removed + void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) { +Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred); } - /// Get the CondCode that's to be used to test the result of the comparison /// libcall against zero. - /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD. 
- ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const { -return CmpLibcallCCs[Call]; + CmpInst::Predicate + getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { +return Libcalls.getSoftFloatCmpLibcallPredicate(Call); } - /// Set the CallingConv that should be used for the specified libcall. void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { Libcalls.setLibcallCallingConv(Call, CC); diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 26c085031a48a..6cc65fabfcc99 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" @@ -73,6 +74,20 @@ struct RuntimeLibcallsInfo { LibcallRoutineNames + RTLIB::UNKNOWN_LIBCALL); } + /// Get the comparison predicate that's to be used to test the result of the + /// comparison libcall against zero. This should only be used with + /// floating-point compare libcalls. + CmpInst::Predicate + getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { +return SoftFloatCompareLibcallPredicates[Call]; + } + + // FIXME: This should be removed. This should be private constant. + void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call, + CmpInst::Predicate Pred) { +SoftFloatCompareLibcallPredicates[Call] = Pred; + } + private: /// Stores the name each libcall. const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1]; @@ -80,6 +95,14 @@ struct RuntimeLibcallsInfo { /// Stores the CallingConv that should be used for each libcall. CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL]; + /// The condition type that should be used to test the result of each of the + /// soft floating-point comparison libcall against integer zero. 
+ /// + // FIXME: This is only relevant for the handful of floating-point comparison + // runtime calls; it's excessive to have a table entry for every single + // opcode. + CmpInst::Predicate SoftFloatCompareLibcallPredicates[RTLIB::UNKNOWN_LIBCALL]; + static bool darwinHasSinCos(const Triple &TT) { assert(TT.isOSDarwin() && "should be called with darwin triple"); // Don't bother with 32 bit x86. @@ -95,6 +118,8 @@ struct RuntimeLibcallsInfo { return true; } + void initSoftFloatCmpLibcallPredicates(); + /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. LLVM_ABI void initLibcalls(const Triple &TT); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a3c4cb4ea0582..5b1c6236afc32 100644 --- a/llvm
[llvm-branch-commits] [llvm] DAG: Move soft float predicate management into RuntimeLibcalls (PR #142905)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/142905?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#142905** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/142905?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/142905?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#142898** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/142898?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/142905 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] DAG: Move soft float predicate management into RuntimeLibcalls (PR #142905)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/142905 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -2627,6 +2629,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) ritter-x2a wrote: There is an `assert(PtrVT == IntVT)` above and a similar assert in `SelectionDAG::getNode()` that rules that out (the reasoning for adding the assert in `getNode` was [here](https://github.com/llvm/llvm-project/pull/140017#issuecomment-2893168440)). We can add this condition here as well to emphasize it even more, but to make the combines truly safe against problems when pointer and index type mismatches are allowed, we'd also need to handle, e.g., cases where the types of `Y` and `Z` in the reassociation below don't match (and there are probably more cases where explicit handling would be required). https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI] Migrate to runtimes build (PR #142696)
@@ -184,24 +199,36 @@ def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]: return check_targets -def _compute_runtimes_to_test(projects_to_test: Set[str]) -> Set[str]: +def _compute_runtimes_to_test(modified_projects: Set[str], platform: str) -> Set[str]: runtimes_to_test = set() -for project_to_test in projects_to_test: -if project_to_test in DEPENDENT_RUNTIMES_TO_TEST: - runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[project_to_test]) -if project_to_test in DEPENDENT_RUNTIMES_TO_BUILD: - runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_BUILD[project_to_test]) -return runtimes_to_test +for modified_project in modified_projects: +if modified_project not in DEPENDENT_RUNTIMES_TO_TEST: +continue +runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[modified_project]) boomanaiden154 wrote: Ack. I'll look at doing this in a separate patch. https://github.com/llvm/llvm-project/pull/142696 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Baseline fneg-fabs.bf16.ll tests. NFC. (PR #142910)
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/142910 None >From 321eb42ae21d0d3156fb5ef15f5b336551a20c5b Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 4 Jun 2025 23:46:28 -0700 Subject: [PATCH] [AMDGPU] Baseline fneg-fabs.bf16.ll tests. NFC. --- llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll | 1223 1 file changed, 1223 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll new file mode 100644 index 0..243469d39cc11 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -0,0 +1,1223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=CIVI,CI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga < %s | FileCheck --check-prefixes=CIVI,VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s + +define amdgpu_kernel void @fneg_fabs_fadd_bf16(ptr addrspace(1) %out, bfloat %x, bfloat %y) { +; CI-LABEL: fneg_fabs_fadd_bf16: +; CI: ; %bb.0: +; CI-NEXT:s_load_dword s2, s[8:9], 0x2 +; CI-NEXT:s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT:s_add_i32 s12, s12, s17 +; CI-NEXT:s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT:s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT:s_waitcnt lgkmcnt(0) +; CI-NEXT:s_and_b32 s3, s2, 0x7fff +; CI-NEXT:s_lshl_b32 s3, s3, 16 +; CI-NEXT:s_and_b32 s2, s2, 0x +; CI-NEXT:v_mov_b32_e32 v0, s3 +; CI-NEXT:v_sub_f32_e32 v0, s2, v0 +; CI-NEXT:v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT:v_mov_b32_e32 v0, s0 +; CI-NEXT:v_mov_b32_e32 v1, s1 +; CI-NEXT:flat_store_short v[0:1], 
v2 +; CI-NEXT:s_endpgm +; +; VI-LABEL: fneg_fabs_fadd_bf16: +; VI: ; %bb.0: +; VI-NEXT:s_load_dword s2, s[8:9], 0x8 +; VI-NEXT:s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT:s_add_i32 s12, s12, s17 +; VI-NEXT:s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT:s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT:s_waitcnt lgkmcnt(0) +; VI-NEXT:s_and_b32 s3, s2, 0x7fff +; VI-NEXT:s_lshl_b32 s3, s3, 16 +; VI-NEXT:s_and_b32 s2, s2, 0x +; VI-NEXT:v_mov_b32_e32 v0, s3 +; VI-NEXT:v_sub_f32_e32 v0, s2, v0 +; VI-NEXT:v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT:v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT:v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT:v_or_b32_e32 v2, 0x40, v0 +; VI-NEXT:v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT:v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT:v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT:v_mov_b32_e32 v0, s0 +; VI-NEXT:v_mov_b32_e32 v1, s1 +; VI-NEXT:flat_store_short v[0:1], v2 +; VI-NEXT:s_endpgm +; +; GFX9-LABEL: fneg_fabs_fadd_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT:s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT:v_mov_b32_e32 v0, 0 +; GFX9-NEXT:s_waitcnt lgkmcnt(0) +; GFX9-NEXT:s_and_b32 s3, s2, 0x7fff +; GFX9-NEXT:s_lshl_b32 s3, s3, 16 +; GFX9-NEXT:s_and_b32 s2, s2, 0x +; GFX9-NEXT:v_mov_b32_e32 v1, s3 +; GFX9-NEXT:v_sub_f32_e32 v1, s2, v1 +; GFX9-NEXT:v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT:v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT:v_or_b32_e32 v3, 0x40, v1 +; GFX9-NEXT:v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT:v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT:v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT:global_store_short_d16_hi v0, v1, s[0:1] +; GFX9-NEXT:s_endpgm +; +; GFX11-TRUE16-LABEL: fneg_fabs_fadd_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT:s_load_b32 s0, s[4:5], 0x8 +; GFX11-TRUE16-NEXT:s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT:s_mov_b32 s1, s0 +; GFX11-TRUE16-NEXT:s_and_b32 s0, s0, 0x +; GFX11-TRUE16-NEXT:s_and_b32 s1, s1, 0x7fff +; GFX11-TRUE16-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-TRUE16-NEXT:s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT:v_sub_f32_e64 v0, s0, s1 +; GFX11-TRUE16-NEXT:s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-TRUE16-NEXT:s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT:v_bfe_u32 v1, v0, 16, 1 +; GFX11-TRUE16-NEXT:v_or_b32_e32 v2, 0x40, v0 +; GFX11-TRUE16-NEXT:v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT:v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT:s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT:v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT:v_dual_m
[llvm-branch-commits] [llvm] [AMDGPU] Patterns for <2 x bfloat> fneg (fabs) (PR #142911)
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/142911 None >From 44a9017e98eff94456889a528a166d6aabca842d Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 4 Jun 2025 23:49:43 -0700 Subject: [PATCH] [AMDGPU] Patterns for <2 x bfloat> fneg (fabs) --- llvm/lib/Target/AMDGPU/SIInstructions.td | 11 +++ llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll | 38 +- 2 files changed, 21 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index a0285e3512a08..360fd05cb3d96 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1840,22 +1840,21 @@ def : GCNPat < (UniformUnaryFrag (v2fp16vt SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) >; -} // This is really (fneg (fabs v2f16:$src)) // // fabs is not reported as free because there is modifier for it in // VOP3P instructions, so it is turned into the bit op. def : GCNPat < - (UniformUnaryFrag (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff, + (UniformUnaryFrag (v2fp16vt (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff, (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; def : GCNPat < - (UniformUnaryFrag (v2f16 (fabs SReg_32:$src))), + (UniformUnaryFrag (v2fp16vt (fabs SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; - +} // COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead // of the real value. 
@@ -1986,12 +1985,12 @@ def : GCNPat < (fabs (v2fp16vt VGPR_32:$src)), (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) >; -} def : GCNPat < - (fneg (v2f16 (fabs VGPR_32:$src))), + (fneg (v2fp16vt (fabs VGPR_32:$src))), (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; +} def : GCNPat < (fabs (f64 VReg_64:$src)), diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 243469d39cc11..d189b6d4c1e83 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -523,8 +523,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; VI-NEXT:v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT:v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT:v_alignbit_b32 v0, v1, v0, 16 -; VI-NEXT:v_and_b32_e32 v0, 0x7fff7fff, v0 -; VI-NEXT:v_xor_b32_e32 v2, 0x80008000, v0 +; VI-NEXT:v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT:v_mov_b32_e32 v0, s0 ; VI-NEXT:v_mov_b32_e32 v1, s1 ; VI-NEXT:flat_store_dword v[0:1], v2 @@ -556,8 +555,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; GFX9-NEXT:v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT:v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT:v_lshl_or_b32 v1, v1, 16, v2 -; GFX9-NEXT:v_and_b32_e32 v1, 0x7fff7fff, v1 -; GFX9-NEXT:v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT:v_or_b32_e32 v1, 0x80008000, v1 ; GFX9-NEXT:global_store_dword v0, v1, s[0:1] ; GFX9-NEXT:s_endpgm ; @@ -590,9 +588,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT:v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT:v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7fff7fff, v0 -; GFX11-NEXT:v_xor_b32_e32 v0, 0x80008000, v0 +; 
GFX11-NEXT:v_mov_b32_e32 v1, 0 +; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT:v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT:s_waitcnt lgkmcnt(0) ; GFX11-NEXT:global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT:s_endpgm @@ -634,8 +632,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x ; VI-NEXT:s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT:s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT:s_waitcnt lgkmcnt(0) -; VI-NEXT:s_and_b32 s2, s2, 0x7fff7fff -; VI-NEXT:s_xor_b32 s2, s2, 0x80008000 +; VI-NEXT:s_or_b32 s2, s2, 0x80008000 ; VI-NEXT:v_mov_b32_e32 v0, s0 ; VI-NEXT:v_mov_b32_e32 v1, s1 ; VI-NEXT:v_mov_b32_e32 v2, s2 @@ -648,8 +645,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x ; GFX9-NEXT:s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT:v_mov_b32_e32 v0, 0 ; GFX9-NEXT:s_waitcnt lgkmcnt(0) -; GFX9-NEXT:s_and_b32 s2, s2, 0x7fff7fff -; GFX9-NEXT:s_xor_b32 s2, s2, 0x80008000 +; GFX9-NEXT:s_or_b32 s2, s2, 0x80008000 ; GFX9-NEXT:v_mov_b32_e32 v1, s2 ; GFX9-NEXT:global_store_dword v0, v1, s[0:1] ; GFX9-NEXT:s_endpgm @@ -660,9 +656,8 @@ define amdgpu_kern
[llvm-branch-commits] [llvm] [AMDGPU] Patterns for <2 x bfloat> fneg (fabs) (PR #142911)
rampitec wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/142911). Learn more: https://graphite.dev/docs/merge-pull-requests * **#142911** 👈 this PR (view in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/142911) * **#142910** (https://app.graphite.dev/github/pr/llvm/llvm-project/142910) * **#142908** (https://app.graphite.dev/github/pr/llvm/llvm-project/142908) * **#142907** (https://app.graphite.dev/github/pr/llvm/llvm-project/142907) * `main` This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/ https://github.com/llvm/llvm-project/pull/142911 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Baseline fneg-fabs.bf16.ll tests. NFC. (PR #142910)
rampitec wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/142910). Learn more: https://graphite.dev/docs/merge-pull-requests * **#142911** (https://app.graphite.dev/github/pr/llvm/llvm-project/142911) * **#142910** 👈 this PR (view in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/142910) * **#142908** (https://app.graphite.dev/github/pr/llvm/llvm-project/142908) * **#142907** (https://app.graphite.dev/github/pr/llvm/llvm-project/142907) * `main` This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/ https://github.com/llvm/llvm-project/pull/142910 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Baseline fneg-fabs.bf16.ll tests. NFC. (PR #142910)
https://github.com/rampitec ready_for_review https://github.com/llvm/llvm-project/pull/142910 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Baseline fneg-fabs.bf16.ll tests. NFC. (PR #142910)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) Changes --- Patch is 52.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142910.diff 1 Files Affected: - (added) llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll (+1223) ``diff diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll new file mode 100644 index 0..243469d39cc11 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -0,0 +1,1223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=CIVI,CI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga < %s | FileCheck --check-prefixes=CIVI,VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s + +define amdgpu_kernel void @fneg_fabs_fadd_bf16(ptr addrspace(1) %out, bfloat %x, bfloat %y) { +; CI-LABEL: fneg_fabs_fadd_bf16: +; CI: ; %bb.0: +; CI-NEXT:s_load_dword s2, s[8:9], 0x2 +; CI-NEXT:s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT:s_add_i32 s12, s12, s17 +; CI-NEXT:s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT:s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT:s_waitcnt lgkmcnt(0) +; CI-NEXT:s_and_b32 s3, s2, 0x7fff +; CI-NEXT:s_lshl_b32 s3, s3, 16 +; CI-NEXT:s_and_b32 s2, s2, 0x +; CI-NEXT:v_mov_b32_e32 v0, s3 +; CI-NEXT:v_sub_f32_e32 v0, s2, v0 +; CI-NEXT:v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT:v_mov_b32_e32 v0, s0 +; CI-NEXT:v_mov_b32_e32 v1, s1 +; CI-NEXT:flat_store_short v[0:1], v2 +; CI-NEXT:s_endpgm +; +; VI-LABEL: fneg_fabs_fadd_bf16: +; VI: ; %bb.0: +; VI-NEXT:s_load_dword s2, s[8:9], 0x8 +; 
VI-NEXT:s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT:s_add_i32 s12, s12, s17 +; VI-NEXT:s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT:s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT:s_waitcnt lgkmcnt(0) +; VI-NEXT:s_and_b32 s3, s2, 0x7fff +; VI-NEXT:s_lshl_b32 s3, s3, 16 +; VI-NEXT:s_and_b32 s2, s2, 0x +; VI-NEXT:v_mov_b32_e32 v0, s3 +; VI-NEXT:v_sub_f32_e32 v0, s2, v0 +; VI-NEXT:v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT:v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT:v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT:v_or_b32_e32 v2, 0x40, v0 +; VI-NEXT:v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT:v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT:v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT:v_mov_b32_e32 v0, s0 +; VI-NEXT:v_mov_b32_e32 v1, s1 +; VI-NEXT:flat_store_short v[0:1], v2 +; VI-NEXT:s_endpgm +; +; GFX9-LABEL: fneg_fabs_fadd_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT:s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT:v_mov_b32_e32 v0, 0 +; GFX9-NEXT:s_waitcnt lgkmcnt(0) +; GFX9-NEXT:s_and_b32 s3, s2, 0x7fff +; GFX9-NEXT:s_lshl_b32 s3, s3, 16 +; GFX9-NEXT:s_and_b32 s2, s2, 0x +; GFX9-NEXT:v_mov_b32_e32 v1, s3 +; GFX9-NEXT:v_sub_f32_e32 v1, s2, v1 +; GFX9-NEXT:v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT:v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT:v_or_b32_e32 v3, 0x40, v1 +; GFX9-NEXT:v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT:v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT:v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT:global_store_short_d16_hi v0, v1, s[0:1] +; GFX9-NEXT:s_endpgm +; +; GFX11-TRUE16-LABEL: fneg_fabs_fadd_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT:s_load_b32 s0, s[4:5], 0x8 +; GFX11-TRUE16-NEXT:s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT:s_mov_b32 s1, s0 +; GFX11-TRUE16-NEXT:s_and_b32 s0, s0, 0x +; GFX11-TRUE16-NEXT:s_and_b32 s1, s1, 0x7fff +; GFX11-TRUE16-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT:s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT:v_sub_f32_e64 v0, s0, s1 +; GFX11-TRUE16-NEXT:s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX11-TRUE16-NEXT:s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT:v_bfe_u32 v1, v0, 16, 1 +; GFX11-TRUE16-NEXT:v_or_b32_e32 v2, 0x40, v0 +; GFX11-TRUE16-NEXT:v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT:v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT:s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT:v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT:v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v1, v2 +; GFX11-TRUE16-NEXT:s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT:global_store_d16_hi_b1
[llvm-branch-commits] [llvm] [AMDGPU] Patterns for <2 x bfloat> fneg (fabs) (PR #142911)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) Changes --- Full diff: https://github.com/llvm/llvm-project/pull/142911.diff 2 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+5-6) - (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll (+16-22) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index a0285e3512a08..360fd05cb3d96 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1840,22 +1840,21 @@ def : GCNPat < (UniformUnaryFrag (v2fp16vt SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) >; -} // This is really (fneg (fabs v2f16:$src)) // // fabs is not reported as free because there is modifier for it in // VOP3P instructions, so it is turned into the bit op. def : GCNPat < - (UniformUnaryFrag (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff, + (UniformUnaryFrag (v2fp16vt (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff, (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; def : GCNPat < - (UniformUnaryFrag (v2f16 (fabs SReg_32:$src))), + (UniformUnaryFrag (v2fp16vt (fabs SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; - +} // COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead // of the real value. 
@@ -1986,12 +1985,12 @@ def : GCNPat < (fabs (v2fp16vt VGPR_32:$src)), (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) >; -} def : GCNPat < - (fneg (v2f16 (fabs VGPR_32:$src))), + (fneg (v2fp16vt (fabs VGPR_32:$src))), (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; +} def : GCNPat < (fabs (f64 VReg_64:$src)), diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 243469d39cc11..d189b6d4c1e83 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -523,8 +523,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; VI-NEXT:v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT:v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT:v_alignbit_b32 v0, v1, v0, 16 -; VI-NEXT:v_and_b32_e32 v0, 0x7fff7fff, v0 -; VI-NEXT:v_xor_b32_e32 v2, 0x80008000, v0 +; VI-NEXT:v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT:v_mov_b32_e32 v0, s0 ; VI-NEXT:v_mov_b32_e32 v1, s1 ; VI-NEXT:flat_store_dword v[0:1], v2 @@ -556,8 +555,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; GFX9-NEXT:v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT:v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT:v_lshl_or_b32 v1, v1, 16, v2 -; GFX9-NEXT:v_and_b32_e32 v1, 0x7fff7fff, v1 -; GFX9-NEXT:v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT:v_or_b32_e32 v1, 0x80008000, v1 ; GFX9-NEXT:global_store_dword v0, v1, s[0:1] ; GFX9-NEXT:s_endpgm ; @@ -590,9 +588,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT:v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT:v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7fff7fff, v0 -; GFX11-NEXT:v_xor_b32_e32 v0, 0x80008000, v0 +; 
GFX11-NEXT:v_mov_b32_e32 v1, 0 +; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT:v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT:s_waitcnt lgkmcnt(0) ; GFX11-NEXT:global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT:s_endpgm @@ -634,8 +632,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x ; VI-NEXT:s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT:s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT:s_waitcnt lgkmcnt(0) -; VI-NEXT:s_and_b32 s2, s2, 0x7fff7fff -; VI-NEXT:s_xor_b32 s2, s2, 0x80008000 +; VI-NEXT:s_or_b32 s2, s2, 0x80008000 ; VI-NEXT:v_mov_b32_e32 v0, s0 ; VI-NEXT:v_mov_b32_e32 v1, s1 ; VI-NEXT:v_mov_b32_e32 v2, s2 @@ -648,8 +645,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x ; GFX9-NEXT:s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT:v_mov_b32_e32 v0, 0 ; GFX9-NEXT:s_waitcnt lgkmcnt(0) -; GFX9-NEXT:s_and_b32 s2, s2, 0x7fff7fff -; GFX9-NEXT:s_xor_b32 s2, s2, 0x80008000 +; GFX9-NEXT:s_or_b32 s2, s2, 0x80008000 ; GFX9-NEXT:v_mov_b32_e32 v1, s2 ; GFX9-NEXT:global_store_dword v0, v1, s[0:1] ; GFX9-NEXT:s_endpgm @@ -660,9 +656,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x ; GFX11-NEXT:s_load_b32 s2, s[4:5], 0x8 ; GFX11-NEXT:s_load_b64
[llvm-branch-commits] [llvm] [CI] Migrate to runtimes build (PR #142696)
https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/142696 >From 360e723b51ee201603f72b56859cd7c6d6faec24 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 5 Jun 2025 06:51:37 + Subject: [PATCH] feedback Created using spr 1.3.4 --- .ci/compute_projects.py | 17 + 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py index b12b729eadd3f..8134e1e2c29fb 100644 --- a/.ci/compute_projects.py +++ b/.ci/compute_projects.py @@ -145,22 +145,15 @@ def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]: def _exclude_projects(current_projects: Set[str], platform: str) -> Set[str]: -new_project_set = set(current_projects) if platform == "Linux": -for to_exclude in EXCLUDE_LINUX: -if to_exclude in new_project_set: -new_project_set.remove(to_exclude) +to_exclude = EXCLUDE_LINUX elif platform == "Windows": -for to_exclude in EXCLUDE_WINDOWS: -if to_exclude in new_project_set: -new_project_set.remove(to_exclude) +to_exclude = EXCLUDE_WINDOWS elif platform == "Darwin": -for to_exclude in EXCLUDE_MAC: -if to_exclude in new_project_set: -new_project_set.remove(to_exclude) +to_exclude = EXCLUDE_MAC else: -raise ValueError("Unexpected platform.") -return new_project_set +raise ValueError(f"Unexpected platform: {platform}") +return current_projects.difference(to_exclude) def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set[str]: ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Patterns for <2 x bfloat> fneg (fabs) (PR #142911)
https://github.com/rampitec ready_for_review https://github.com/llvm/llvm-project/pull/142911 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Make <2 x bfloat> fabs legal (PR #142908)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/142908 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CVP] Keep `ReachableCaseCount` in sync with range of condition (#142302) (PR #142730)
llvmbot wrote: @llvm/pr-subscribers-llvm-transforms Author: Yingwei Zheng (dtcxzyw) Changes Backport https://github.com/llvm/llvm-project/commit/0f7cc4132b62e0ecdbd3193e954b745c5f492e90. https://github.com/llvm/llvm-project/pull/79993 assumes that a reachable case must be contained by `CR`. However, it doesn't hold for some edge cases. This patch adds additional checks to ensure `ReachableCaseCount` is correct. Note: Similar optimization in SCCP isn't affected by this bug because it uses `CR` to compute `ReachableCaseCount`. Closes https://github.com/llvm/llvm-project/issues/142286. --- Full diff: https://github.com/llvm/llvm-project/pull/142730.diff 2 Files Affected: - (modified) llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp (+35-24) - (modified) llvm/test/Transforms/CorrelatedValuePropagation/switch.ll (+36) ``diff diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 8e74b8645fad9..86c4170b9a977 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -370,15 +370,30 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, { // Scope for SwitchInstProfUpdateWrapper. It must not live during // ConstantFoldTerminator() as the underlying SwitchInst can be changed. 
SwitchInstProfUpdateWrapper SI(*I); +ConstantRange CR = +LVI->getConstantRangeAtUse(I->getOperandUse(0), /*UndefAllowed=*/false); unsigned ReachableCaseCount = 0; for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); - auto *Res = dyn_cast_or_null( - LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, - /* UseBlockValue */ true)); + std::optional Predicate = std::nullopt; + if (!CR.contains(Case->getValue())) +Predicate = false; + else if (CR.isSingleElement() && + *CR.getSingleElement() == Case->getValue()) +Predicate = true; + if (!Predicate) { +// Handle missing cases, e.g., the range has a hole. +auto *Res = dyn_cast_or_null( +LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, +/* UseBlockValue=*/true)); +if (Res && Res->isZero()) + Predicate = false; +else if (Res && Res->isOne()) + Predicate = true; + } - if (Res && Res->isZero()) { + if (Predicate && !*Predicate) { // This case never fires - remove it. BasicBlock *Succ = CI->getCaseSuccessor(); Succ->removePredecessor(BB); @@ -395,7 +410,7 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB, Succ}}); continue; } - if (Res && Res->isOne()) { + if (Predicate && *Predicate) { // This case always fires. Arrange for the switch to be turned into an // unconditional branch by replacing the switch condition with the case // value. @@ -410,28 +425,24 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, ++ReachableCaseCount; } -BasicBlock *DefaultDest = SI->getDefaultDest(); -if (ReachableCaseCount > 1 && -!isa(DefaultDest->getFirstNonPHIOrDbg())) { - ConstantRange CR = LVI->getConstantRangeAtUse(I->getOperandUse(0), -/*UndefAllowed*/ false); - // The default dest is unreachable if all cases are covered. 
- if (!CR.isSizeLargerThan(ReachableCaseCount)) { -BasicBlock *NewUnreachableBB = -BasicBlock::Create(BB->getContext(), "default.unreachable", - BB->getParent(), DefaultDest); -new UnreachableInst(BB->getContext(), NewUnreachableBB); +// The default dest is unreachable if all cases are covered. +if (!SI->defaultDestUndefined() && +!CR.isSizeLargerThan(ReachableCaseCount)) { + BasicBlock *DefaultDest = SI->getDefaultDest(); + BasicBlock *NewUnreachableBB = + BasicBlock::Create(BB->getContext(), "default.unreachable", + BB->getParent(), DefaultDest); + new UnreachableInst(BB->getContext(), NewUnreachableBB); -DefaultDest->removePredecessor(BB); -SI->setDefaultDest(NewUnreachableBB); + DefaultDest->removePredecessor(BB); + SI->setDefaultDest(NewUnreachableBB); -if (SuccessorsCount[DefaultDest] == 1) - DTU.applyUpdates({{DominatorTree::Delete, BB, DefaultDest}}); -DTU.applyUpdates({{DominatorTree::Insert, BB, NewUnreachableBB}}); + if (SuccessorsCount[DefaultDest] == 1) +DTU.applyUpdates({{DominatorTree::Delete, BB, DefaultDest}}); + DTU.applyUpdates({{DominatorTree::Insert, BB, NewUnreachableBB}}); -++NumDeadCases; -
[llvm-branch-commits] [llvm] [CVP] Keep `ReachableCaseCount` in sync with range of condition (#142302) (PR #142730)
https://github.com/dtcxzyw milestoned https://github.com/llvm/llvm-project/pull/142730 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/ritter-x2a created https://github.com/llvm/llvm-project/pull/142739 This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. >From 1afe2b10cb3781c57adb2eec584b7fc07c073cf8 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 03:32:32 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. 
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 92 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + .../AMDGPU/ptradd-sdag-optimizations.ll | 193 ++ 4 files changed, 201 insertions(+), 134 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9e418329d15be..e45134a2e5548 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -419,6 +419,7 @@ namespace { SDValue visitADDLike(SDNode *N); SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); +SDValue visitPTRADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDSAT(SDNode *N); SDValue visitSUBSAT(SDNode *N); @@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return true; } - if (Opc != ISD::ADD) + if (Opc != ISD::ADD && Opc != ISD::PTRADD) return false; auto *C2 = dyn_cast(N1); @@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::TokenFactor:return visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD:return visitADD(N); + case ISD::PTRADD: return visitPTRADD(N); case ISD::SUB:return visitSUB(N); case ISD::SADDSAT: case ISD::UADDSAT:return visitADDSAT(N); @@ -2623,6 +2625,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. 
+ assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. +if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. + if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; + SDVa
[llvm-branch-commits] [llvm] [AMDGPU] New RegBankSelect: Add rules for `G_PTRTOINT` and `G_INTTOPTR` (PR #142604)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/142604 >From 9fd34f632f194a025669b2c2c0f83d19fb48b00c Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 3 Jun 2025 15:08:06 +0200 Subject: [PATCH 1/3] [AMDGPU] New RegBankSelect: Add rules for `G_PTRTOINT` and `G_INTTOPTR` --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 16 ++- .../GlobalISel/regbankselect-inttoptr.mir | 98 +++ .../GlobalISel/regbankselect-ptrtoint.mir | 98 +++ 3 files changed, 211 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5402129e41887..61d7d084b21da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -718,7 +718,21 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}); - addRulesForGOpcs({G_INTTOPTR}).Any({{UniP4}, {{SgprP4}, {Sgpr64}}}); + addRulesForGOpcs({G_INTTOPTR}) + .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}}) + .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}}) + .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}}) + .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}}) + .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}}) + .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}}); + + addRulesForGOpcs({G_PTRTOINT}) + .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}}) + .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}}) + .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}}) + .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}}) + .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}}) + .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}}); addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir index 42600d7d0dd7a..d9b1b6e20089b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir 
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir @@ -2,6 +2,8 @@ # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s --check-prefix=NEW_RBS + --- name: inttoptr_s_p0 legalized: true @@ -14,6 +16,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p0) = G_INTTOPTR [[COPY]](s64) +; +; NEW_RBS-LABEL: name: inttoptr_s_p0 +; NEW_RBS: liveins: $sgpr0_sgpr1 +; NEW_RBS-NEXT: {{ $}} +; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 +; NEW_RBS-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p0) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $sgpr0_sgpr1 %1:_(p0) = G_INTTOPTR %0 ... @@ -30,6 +38,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p0) = G_INTTOPTR [[COPY]](s64) +; +; NEW_RBS-LABEL: name: inttoptr_v_p0 +; NEW_RBS: liveins: $vgpr0_vgpr1 +; NEW_RBS-NEXT: {{ $}} +; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 +; NEW_RBS-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p0) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(p0) = G_INTTOPTR %0 ... @@ -46,6 +60,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p1) = G_INTTOPTR [[COPY]](s64) +; +; NEW_RBS-LABEL: name: inttoptr_s_p1 +; NEW_RBS: liveins: $sgpr0_sgpr1 +; NEW_RBS-NEXT: {{ $}} +; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 +; NEW_RBS-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p1) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $sgpr0_sgpr1 %1:_(p1) = G_INTTOPTR %0 ... 
@@ -62,6 +82,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p1) = G_INTTOPTR [[COPY]](s64) +; +; NEW_RBS-LABEL: name: inttoptr_v_p1 +; NEW_RBS: liveins: $vgpr0_vgpr1 +; NEW_RBS-NEXT: {{ $}} +; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 +; NEW_RBS-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p1) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(p1) = G_INTTOPTR %0 ... @@ -78,6 +104,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p2) = G_INTTOPTR [[COPY]](s32) +; +
[llvm-branch-commits] [llvm] [AMDGPU] New RegBankSelect: Add S128 types (PR #142601)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/142601 >From 96669eee5e756faed679480521faafd9f1bad9d1 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 3 Jun 2025 13:27:55 +0200 Subject: [PATCH] [AMDGPU] New RegBanKSelect: Add S128 types --- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 9 + llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 6 ++ llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h| 5 + 3 files changed, 20 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 7ff822c6f6580..89af982636590 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -556,6 +556,9 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr64: case Vgpr64: return LLT::scalar(64); + case Sgpr128: + case Vgpr128: +return LLT::scalar(128); case VgprP0: return LLT::pointer(0, 64); case SgprP1: @@ -646,6 +649,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Sgpr16: case Sgpr32: case Sgpr64: + case Sgpr128: case SgprP1: case SgprP3: case SgprP4: @@ -678,6 +682,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Vgpr16: case Vgpr32: case Vgpr64: + case Vgpr128: case VgprP0: case VgprP1: case VgprP3: @@ -718,6 +723,7 @@ void RegBankLegalizeHelper::applyMappingDst( case Sgpr16: case Sgpr32: case Sgpr64: +case Sgpr128: case SgprP1: case SgprP3: case SgprP4: @@ -728,6 +734,7 @@ void RegBankLegalizeHelper::applyMappingDst( case Vgpr16: case Vgpr32: case Vgpr64: +case Vgpr128: case VgprP0: case VgprP1: case VgprP3: @@ -839,6 +846,7 @@ void RegBankLegalizeHelper::applyMappingSrc( case Sgpr16: case Sgpr32: case Sgpr64: +case Sgpr128: case SgprP1: case SgprP3: case SgprP4: @@ -865,6 +873,7 @@ void RegBankLegalizeHelper::applyMappingSrc( case Vgpr16: case Vgpr32: case Vgpr64: +case Vgpr128: case 
VgprP0: case VgprP1: case VgprP3: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5e21f44f7d545..672fc5b79abc2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::scalar(32); case S64: return MRI.getType(Reg) == LLT::scalar(64); + case S128: +return MRI.getType(Reg) == LLT::scalar(128); case P0: return MRI.getType(Reg) == LLT::pointer(0, 64); case P1: @@ -84,6 +86,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); case UniS64: return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + case UniS128: +return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniform(Reg); case UniP0: return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg); case UniP1: @@ -116,6 +120,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); case DivS64: return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivS128: +return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergent(Reg); case DivP0: return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg); case DivP1: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index bddfb8dd1913f..30b900d871f3c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -39,16 +39,19 @@ enum UniformityLLTOpPredicateID { S16, S32, S64, + S128, UniS1, UniS16, UniS32, UniS64, + UniS128, DivS1, DivS16, DivS32, DivS64, + DivS128, // pointers P0, @@ -117,6 +120,7 @@ enum RegBankLLTMappingApplyID { Sgpr16, Sgpr32, 
Sgpr64, + Sgpr128, SgprP1, SgprP3, SgprP4, @@ -135,6 +139,7 @@ enum RegBankLLTMappingApplyID { Vgpr16, Vgpr32, Vgpr64, + Vgpr128, VgprP0, VgprP1, VgprP3, ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Improve test coverage for G_INTTOPTR and G_PTRTOINT (PR #142603)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/142603 >From 3a47927dfaaa98cb0d2a336bfa416d2eb28e294d Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 3 Jun 2025 15:03:01 +0200 Subject: [PATCH] [AMDGPU] Improve test coverage for G_INTTOPTR and G_PTRTOINT Test P0 through P6 + P8 for both S/VGPRs. --- .../GlobalISel/regbankselect-inttoptr.mir | 236 +- .../GlobalISel/regbankselect-ptrtoint.mir | 232 - 2 files changed, 458 insertions(+), 10 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir index 053aede615f86..42600d7d0dd7a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir @@ -3,29 +3,29 @@ # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s --- -name: inttoptr_s +name: inttoptr_s_p0 legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 -; CHECK-LABEL: name: inttoptr_s +; CHECK-LABEL: name: inttoptr_s_p0 ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 -; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p4) = G_INTTOPTR [[COPY]](s64) +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p0) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $sgpr0_sgpr1 -%1:_(p4) = G_INTTOPTR %0 +%1:_(p0) = G_INTTOPTR %0 ... --- -name: inttoptr_v +name: inttoptr_v_p0 legalized: true body: | bb.0: liveins: $vgpr0_vgpr1 -; CHECK-LABEL: name: inttoptr_v +; CHECK-LABEL: name: inttoptr_v_p0 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 @@ -33,3 +33,227 @@ body: | %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(p0) = G_INTTOPTR %0 ... 
+ +--- +name: inttoptr_s_p1 +legalized: true + +body: | + bb.0: +liveins: $sgpr0_sgpr1 +; CHECK-LABEL: name: inttoptr_s_p1 +; CHECK: liveins: $sgpr0_sgpr1 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p1) = G_INTTOPTR [[COPY]](s64) +%0:_(s64) = COPY $sgpr0_sgpr1 +%1:_(p1) = G_INTTOPTR %0 +... + +--- +name: inttoptr_v_p1 +legalized: true + +body: | + bb.0: +liveins: $vgpr0_vgpr1 +; CHECK-LABEL: name: inttoptr_v_p1 +; CHECK: liveins: $vgpr0_vgpr1 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p1) = G_INTTOPTR [[COPY]](s64) +%0:_(s64) = COPY $vgpr0_vgpr1 +%1:_(p1) = G_INTTOPTR %0 +... + +--- +name: inttoptr_s_p2 +legalized: true + +body: | + bb.0: +liveins: $sgpr0 +; CHECK-LABEL: name: inttoptr_s_p2 +; CHECK: liveins: $sgpr0 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p2) = G_INTTOPTR [[COPY]](s32) +%0:_(s32) = COPY $sgpr0 +%1:_(p2) = G_INTTOPTR %0 +... + +--- +name: inttoptr_v_p2 +legalized: true + +body: | + bb.0: +liveins: $vgpr0 +; CHECK-LABEL: name: inttoptr_v_p2 +; CHECK: liveins: $vgpr0 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p2) = G_INTTOPTR [[COPY]](s32) +%0:_(s32) = COPY $vgpr0 +%1:_(p2) = G_INTTOPTR %0 +... + +--- +name: inttoptr_s_p3 +legalized: true + +body: | + bb.0: +liveins: $sgpr0 +; CHECK-LABEL: name: inttoptr_s_p3 +; CHECK: liveins: $sgpr0 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p3) = G_INTTOPTR [[COPY]](s32) +%0:_(s32) = COPY $sgpr0 +%1:_(p3) = G_INTTOPTR %0 +... 
+ +--- +name: inttoptr_v_p3 +legalized: true + +body: | + bb.0: +liveins: $vgpr0 +; CHECK-LABEL: name: inttoptr_v_p3 +; CHECK: liveins: $vgpr0 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p3) = G_INTTOPTR [[COPY]](s32) +%0:_(s32) = COPY $vgpr0 +%1:_(p3) = G_INTTOPTR %0 +... + +--- +name: inttoptr_s_p4 +legalized: true + +body: | + bb.0: +liveins: $sgpr0_sgpr1 +; CHECK-LABEL: name: inttoptr_s_p4 +; CHECK: liveins: $sgpr0_sgpr1 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p4) = G_INTTOPTR [[COPY]](s64) +%0:_(s64) = COPY $sgpr0_sgpr1 +%1:_(p4) = G_INTTOPTR %0 +... + +--- +name: inttoptr_v_p4 +legalized: true + +body: | + bb.0: +liveins: $vgpr0_vgpr1 +; CHECK-LABEL: name: inttoptr_v_p4 +; CHECK: liveins: $vgpr0_vgpr1 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 +;
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Fabian Ritter (ritter-x2a) Changes Pre-committing tests to show improvements in a follow-up PR with the combines. --- Full diff: https://github.com/llvm/llvm-project/pull/142738.diff 1 Files Affected: - (added) llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll (+207) ``diff diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll new file mode 100644 index 0..0241be9197e1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s + +; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG +; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable +; similar transformations in that pass. + +; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use. 
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_ZTwoUses: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_ZTwoUses: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24 + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset + %l = load i64, ptr addrspace(1) %gep1, align 8 + %r = add i64 %l, %voffset + ret i64 %r +} + +define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt 
vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %add0 = add nuw nsw i64 %voffset, 24 + %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 + %l = load i64, ptr addrspace(1) %gep0, align 8 + ret i64 %l +} + +; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These +; would be folded away in most cases, but the index computation introduced by +; the legalization of wide vector stores can for example introduce them. +define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { +; GFX942_PTRADD-LABEL: store_v16i32: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23 +; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s17 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s18 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s19 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX942
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Fabian Ritter (ritter-x2a) Changes This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. --- Patch is 20.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142739.diff 4 Files Affected: - (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+91-1) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+49) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1) - (modified) llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll (+60-133) ``diff diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9e418329d15be..e45134a2e5548 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -419,6 +419,7 @@ namespace { SDValue visitADDLike(SDNode *N); SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); +SDValue visitPTRADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDSAT(SDNode *N); SDValue visitSUBSAT(SDNode *N); @@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return true; } - if (Opc != ISD::ADD) + if (Opc != ISD::ADD && Opc != ISD::PTRADD) return false; auto *C2 = dyn_cast(N1); @@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::TokenFactor:return visitTokenFactor(N); case ISD::MERGE_VALUES: 
return visitMERGE_VALUES(N); case ISD::ADD:return visitADD(N); + case ISD::PTRADD: return visitPTRADD(N); case ISD::SUB:return visitSUB(N); case ISD::SADDSAT: case ISD::UADDSAT:return visitADDSAT(N); @@ -2623,6 +2625,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. +if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. 
+ if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; + SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); + AddToWorklist(Add.getNode()); + return DAG.getMemBasePlusOffset(X, Add, DL, Flags); +} + +// TODO: There is another possible fold here that was proven useful. +// It would be this: +// +// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if: +// * (ptradd x, y) has one use; and +// * y is a constant; and +// * z is not a constant. +// +// In some cases, specifically in AArch64's FEAT_CPA, it exposes the +// opportunity to select more complex instructions such as SUBPT and +// MSUBPT. However, a hypothetical corner case has been found that we co
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/ritter-x2a ready_for_review https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Improve test coverage for G_INTTOPTR and G_PTRTOINT (PR #142603)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/142603 >From 3a47927dfaaa98cb0d2a336bfa416d2eb28e294d Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 3 Jun 2025 15:03:01 +0200 Subject: [PATCH] [AMDGPU] Improve test coverage for G_INTTOPTR and G_PTRTOINT Test P0 through P6 + P8 for both S/VGPRs. --- .../GlobalISel/regbankselect-inttoptr.mir | 236 +- .../GlobalISel/regbankselect-ptrtoint.mir | 232 - 2 files changed, 458 insertions(+), 10 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir index 053aede615f86..42600d7d0dd7a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir @@ -3,29 +3,29 @@ # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s --- -name: inttoptr_s +name: inttoptr_s_p0 legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 -; CHECK-LABEL: name: inttoptr_s +; CHECK-LABEL: name: inttoptr_s_p0 ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 -; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p4) = G_INTTOPTR [[COPY]](s64) +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p0) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $sgpr0_sgpr1 -%1:_(p4) = G_INTTOPTR %0 +%1:_(p0) = G_INTTOPTR %0 ... --- -name: inttoptr_v +name: inttoptr_v_p0 legalized: true body: | bb.0: liveins: $vgpr0_vgpr1 -; CHECK-LABEL: name: inttoptr_v +; CHECK-LABEL: name: inttoptr_v_p0 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 @@ -33,3 +33,227 @@ body: | %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(p0) = G_INTTOPTR %0 ... 
+ +--- +name: inttoptr_s_p1 +legalized: true + +body: | + bb.0: +liveins: $sgpr0_sgpr1 +; CHECK-LABEL: name: inttoptr_s_p1 +; CHECK: liveins: $sgpr0_sgpr1 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p1) = G_INTTOPTR [[COPY]](s64) +%0:_(s64) = COPY $sgpr0_sgpr1 +%1:_(p1) = G_INTTOPTR %0 +... + +--- +name: inttoptr_v_p1 +legalized: true + +body: | + bb.0: +liveins: $vgpr0_vgpr1 +; CHECK-LABEL: name: inttoptr_v_p1 +; CHECK: liveins: $vgpr0_vgpr1 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p1) = G_INTTOPTR [[COPY]](s64) +%0:_(s64) = COPY $vgpr0_vgpr1 +%1:_(p1) = G_INTTOPTR %0 +... + +--- +name: inttoptr_s_p2 +legalized: true + +body: | + bb.0: +liveins: $sgpr0 +; CHECK-LABEL: name: inttoptr_s_p2 +; CHECK: liveins: $sgpr0 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p2) = G_INTTOPTR [[COPY]](s32) +%0:_(s32) = COPY $sgpr0 +%1:_(p2) = G_INTTOPTR %0 +... + +--- +name: inttoptr_v_p2 +legalized: true + +body: | + bb.0: +liveins: $vgpr0 +; CHECK-LABEL: name: inttoptr_v_p2 +; CHECK: liveins: $vgpr0 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p2) = G_INTTOPTR [[COPY]](s32) +%0:_(s32) = COPY $vgpr0 +%1:_(p2) = G_INTTOPTR %0 +... + +--- +name: inttoptr_s_p3 +legalized: true + +body: | + bb.0: +liveins: $sgpr0 +; CHECK-LABEL: name: inttoptr_s_p3 +; CHECK: liveins: $sgpr0 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p3) = G_INTTOPTR [[COPY]](s32) +%0:_(s32) = COPY $sgpr0 +%1:_(p3) = G_INTTOPTR %0 +... 
+ +--- +name: inttoptr_v_p3 +legalized: true + +body: | + bb.0: +liveins: $vgpr0 +; CHECK-LABEL: name: inttoptr_v_p3 +; CHECK: liveins: $vgpr0 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p3) = G_INTTOPTR [[COPY]](s32) +%0:_(s32) = COPY $vgpr0 +%1:_(p3) = G_INTTOPTR %0 +... + +--- +name: inttoptr_s_p4 +legalized: true + +body: | + bb.0: +liveins: $sgpr0_sgpr1 +; CHECK-LABEL: name: inttoptr_s_p4 +; CHECK: liveins: $sgpr0_sgpr1 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 +; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p4) = G_INTTOPTR [[COPY]](s64) +%0:_(s64) = COPY $sgpr0_sgpr1 +%1:_(p4) = G_INTTOPTR %0 +... + +--- +name: inttoptr_v_p4 +legalized: true + +body: | + bb.0: +liveins: $vgpr0_vgpr1 +; CHECK-LABEL: name: inttoptr_v_p4 +; CHECK: liveins: $vgpr0_vgpr1 +; CHECK-NEXT: {{ $}} +; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 +;
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
https://github.com/ritter-x2a created https://github.com/llvm/llvm-project/pull/142738 Pre-committing tests to show improvements in a follow-up PR with the combines. >From d363847d4c4f3922875c23c69fd0e6e0148c7eff Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 3 Jun 2025 09:49:19 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines Pre-committing tests to show improvements in a follow-up PR with the combines. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 207 ++ 1 file changed, 207 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll new file mode 100644 index 0..0241be9197e1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s + +; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG +; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable +; similar transformations in that pass. + +; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use. 
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_ZTwoUses: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_ZTwoUses: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24 + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset + %l = load i64, ptr addrspace(1) %gep1, align 8 + %r = add i64 %l, %voffset + ret i64 %r +} + +define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt 
vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %add0 = add nuw nsw i64 %voffset, 24 + %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 + %l = load i64, ptr addrspace(1) %gep0, align 8 + ret i64 %l +} + +; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These +; would be folded away in most cases, but the index computation introduced by +; the legalization of wide vector stores can for example introduce them. +define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { +; GFX942_PTRADD-LABEL: store_v16i32: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23 +; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +;
[llvm-branch-commits] [llvm] [AMDGPU] Improve test coverage for G_INTTOPTR and G_PTRTOINT (PR #142603)
https://github.com/rovka approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/142603 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] New RegBankSelect: Add Ptr32/Ptr64/Ptr128 (PR #142602)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/142602 >From c69258d78459b8dcc89bec38a8a795763cd3dc80 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 3 Jun 2025 14:40:38 +0200 Subject: [PATCH] [AMDGPU] New RegBankSelect: Add Ptr32/Ptr64/Ptr128 There's quite a few opcodes that do not care about the exact AS of the pointer, just its size. Adding generic types for these will help reduce duplication in the rule definitions. I also moved the usual B types to use the new `isAnyPtr` helper I added to make sure they're supersets of the `Ptr` cases --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 42 +++ .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 29 +++-- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 19 + 3 files changed, 77 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 89af982636590..b2ddc6e88966b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -595,17 +595,23 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) { case VgprB32: case UniInVgprB32: if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) || -Ty == LLT::pointer(3, 32) || Ty == LLT::pointer(5, 32) || -Ty == LLT::pointer(6, 32)) +isAnyPtr(Ty, 32)) return Ty; return LLT(); + case SgprPtr32: + case VgprPtr32: +return isAnyPtr(Ty, 32) ? Ty : LLT(); + case SgprPtr64: + case VgprPtr64: +return isAnyPtr(Ty, 64) ? Ty : LLT(); + case SgprPtr128: + case VgprPtr128: +return isAnyPtr(Ty, 128) ? 
Ty : LLT(); case SgprB64: case VgprB64: case UniInVgprB64: if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) || -Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(0, 64) || -Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64) || -(Ty.isPointer() && Ty.getAddressSpace() > AMDGPUAS::MAX_AMDGPU_ADDRESS)) +Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64)) return Ty; return LLT(); case SgprB96: @@ -619,7 +625,7 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) { case VgprB128: case UniInVgprB128: if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) || -Ty == LLT::fixed_vector(2, 64)) +Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128)) return Ty; return LLT(); case SgprB256: @@ -654,6 +660,9 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprP3: case SgprP4: case SgprP5: + case SgprPtr32: + case SgprPtr64: + case SgprPtr128: case SgprV2S16: case SgprV2S32: case SgprV4S32: @@ -688,6 +697,9 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case VgprP3: case VgprP4: case VgprP5: + case VgprPtr32: + case VgprPtr64: + case VgprPtr128: case VgprV2S16: case VgprV2S32: case VgprV4S32: @@ -754,12 +766,18 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprB128: case SgprB256: case SgprB512: +case SgprPtr32: +case SgprPtr64: +case SgprPtr128: case VgprB32: case VgprB64: case VgprB96: case VgprB128: case VgprB256: -case VgprB512: { +case VgprB512: +case VgprPtr32: +case VgprPtr64: +case VgprPtr128: { assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty)); assert(RB == getRegBankFromID(MethodIDs[OpIdx])); break; @@ -864,7 +882,10 @@ void RegBankLegalizeHelper::applyMappingSrc( case SgprB96: case SgprB128: case SgprB256: -case SgprB512: { +case SgprB512: +case SgprPtr32: +case SgprPtr64: +case SgprPtr128: { assert(Ty == getBTyFromID(MethodIDs[i], Ty)); assert(RB == getRegBankFromID(MethodIDs[i])); break; @@ -895,7 +916,10 @@ void RegBankLegalizeHelper::applyMappingSrc( 
case VgprB96: case VgprB128: case VgprB256: -case VgprB512: { +case VgprB512: +case VgprPtr32: +case VgprPtr64: +case VgprPtr128: { assert(Ty == getBTyFromID(MethodIDs[i], Ty)); if (RB != VgprRB) { auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 672fc5b79abc2..5402129e41887 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -26,6 +26,10 @@ using namespace llvm; using namespace AMDGPU; +bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) { + return Ty.isPointer() && Ty.getSizeInBits() == Width; +} + RegBankLLTMapping::RegBankLLTMapping( std::initializer_list DstOpMappingList, std::initializer_list SrcOpMappingList, @@ -62,6 +66,12 @@ bool matchUniformityAndLLT(Register Reg
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
ritter-x2a wrote:
> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite. Learn more: https://graphite.dev/docs/merge-pull-requests

* **#142739** (View in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/142739)
* **#142738** 👈 this PR (View in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/142738)
* **#141725** (View in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/141725)
* `main`

This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/142738
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] New RegBankSelect: Add Ptr32/Ptr64/Ptr128 (PR #142602)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/142602 >From c69258d78459b8dcc89bec38a8a795763cd3dc80 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 3 Jun 2025 14:40:38 +0200 Subject: [PATCH] [AMDGPU] New RegBankSelect: Add Ptr32/Ptr64/Ptr128 There's quite a few opcodes that do not care about the exact AS of the pointer, just its size. Adding generic types for these will help reduce duplication in the rule definitions. I also moved the usual B types to use the new `isAnyPtr` helper I added to make sure they're supersets of the `Ptr` cases --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 42 +++ .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 29 +++-- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 19 + 3 files changed, 77 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 89af982636590..b2ddc6e88966b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -595,17 +595,23 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) { case VgprB32: case UniInVgprB32: if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) || -Ty == LLT::pointer(3, 32) || Ty == LLT::pointer(5, 32) || -Ty == LLT::pointer(6, 32)) +isAnyPtr(Ty, 32)) return Ty; return LLT(); + case SgprPtr32: + case VgprPtr32: +return isAnyPtr(Ty, 32) ? Ty : LLT(); + case SgprPtr64: + case VgprPtr64: +return isAnyPtr(Ty, 64) ? Ty : LLT(); + case SgprPtr128: + case VgprPtr128: +return isAnyPtr(Ty, 128) ? 
Ty : LLT(); case SgprB64: case VgprB64: case UniInVgprB64: if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) || -Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(0, 64) || -Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64) || -(Ty.isPointer() && Ty.getAddressSpace() > AMDGPUAS::MAX_AMDGPU_ADDRESS)) +Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64)) return Ty; return LLT(); case SgprB96: @@ -619,7 +625,7 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) { case VgprB128: case UniInVgprB128: if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) || -Ty == LLT::fixed_vector(2, 64)) +Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128)) return Ty; return LLT(); case SgprB256: @@ -654,6 +660,9 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprP3: case SgprP4: case SgprP5: + case SgprPtr32: + case SgprPtr64: + case SgprPtr128: case SgprV2S16: case SgprV2S32: case SgprV4S32: @@ -688,6 +697,9 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case VgprP3: case VgprP4: case VgprP5: + case VgprPtr32: + case VgprPtr64: + case VgprPtr128: case VgprV2S16: case VgprV2S32: case VgprV4S32: @@ -754,12 +766,18 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprB128: case SgprB256: case SgprB512: +case SgprPtr32: +case SgprPtr64: +case SgprPtr128: case VgprB32: case VgprB64: case VgprB96: case VgprB128: case VgprB256: -case VgprB512: { +case VgprB512: +case VgprPtr32: +case VgprPtr64: +case VgprPtr128: { assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty)); assert(RB == getRegBankFromID(MethodIDs[OpIdx])); break; @@ -864,7 +882,10 @@ void RegBankLegalizeHelper::applyMappingSrc( case SgprB96: case SgprB128: case SgprB256: -case SgprB512: { +case SgprB512: +case SgprPtr32: +case SgprPtr64: +case SgprPtr128: { assert(Ty == getBTyFromID(MethodIDs[i], Ty)); assert(RB == getRegBankFromID(MethodIDs[i])); break; @@ -895,7 +916,10 @@ void RegBankLegalizeHelper::applyMappingSrc( 
case VgprB96: case VgprB128: case VgprB256: -case VgprB512: { +case VgprB512: +case VgprPtr32: +case VgprPtr64: +case VgprPtr128: { assert(Ty == getBTyFromID(MethodIDs[i], Ty)); if (RB != VgprRB) { auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 672fc5b79abc2..5402129e41887 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -26,6 +26,10 @@ using namespace llvm; using namespace AMDGPU; +bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) { + return Ty.isPointer() && Ty.getSizeInBits() == Width; +} + RegBankLLTMapping::RegBankLLTMapping( std::initializer_list DstOpMappingList, std::initializer_list SrcOpMappingList, @@ -62,6 +66,12 @@ bool matchUniformityAndLLT(Register Reg
[llvm-branch-commits] [llvm] [AMDGPU] New RegBanKSelect: Add S128 types (PR #142601)
https://github.com/Pierre-vh edited https://github.com/llvm/llvm-project/pull/142601 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [AArch64][SME] Fix accessing the emergency spill slot with hazard padding (#142190) (PR #142741)
https://github.com/MacDue commented: cc @efriedma-quic https://github.com/llvm/llvm-project/pull/142741 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
@@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s + +; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG +; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable +; similar transformations in that pass. + +; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use. +define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_ZTwoUses: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_ZTwoUses: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24 + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset + %l = load i64, ptr addrspace(1) %gep1, align 8 + %r = add i64 %l, %voffset + ret i64 %r +} + +define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, 
i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %add0 = add nuw nsw i64 %voffset, 24 + %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 + %l = load i64, ptr addrspace(1) %gep0, align 8 + ret i64 %l +} + +; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These +; would be folded away in most cases, but the index computation introduced by +; the legalization of wide vector stores can for example introduce them. 
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { +; GFX942_PTRADD-LABEL: store_v16i32: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23 +; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s17 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s18 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s19 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s12 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s13 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s14 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s15 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s8 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s9 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s10 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s11 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942_PTRAD
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
ritter-x2a wrote:
> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite. Learn more: https://graphite.dev/docs/merge-pull-requests

* **#142739** 👈 this PR (View in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/142739)
* **#142738** (View in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/142738)
* **#141725** (View in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/141725)
* `main`

This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/142739
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64][SME] Support split ZPR and PPR area allocation (PR #142392)
MacDue wrote: > If you express the size of the hazard padding between the PPRs and ZPRs as a > scalable size, that might simplify some of the logic? You wouldn't need to > represent the two areas as separate stacks, at least. It would, but for the sizes of hazard padding and vscale we're interested in, it would result in a much larger allocation than necessary and likely complicate addressing predicates and vectors more so (due to limited ranges for scalable offsets). https://github.com/llvm/llvm-project/pull/142392 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen] Limit number of analyzed predecessors (PR #142584)
https://github.com/aengelke updated https://github.com/llvm/llvm-project/pull/142584 >From 4cbc231699c11444cff73ff28b88dc0f3835c752 Mon Sep 17 00:00:00 2001 From: Alexis Engelke Date: Wed, 4 Jun 2025 09:21:02 + Subject: [PATCH] Move one check to beginning of function Created using spr 1.3.5-bogner --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index e96f3f8193b09..2dbabfe345d5e 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -1483,6 +1483,11 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( if (SuccChain.UnscheduledPredecessors == 0) return false; + // Compile-time optimization: runtime is quadratic in the number of + // predecessors. For such uncommon cases, exit early. + if (Succ->pred_size() > PredecessorLimit) +return false; + // There are two basic scenarios here: // - // Case 1: triangular shape CFG (if-then): @@ -1603,11 +1608,6 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb; bool BadCFGConflict = false; - // Compile-time optimization: runtime is quadratic in the number of - // predecessors. For such uncommon cases, exit early. - if (Succ->pred_size() > PredecessorLimit) -return false; - for (MachineBasicBlock *Pred : Succ->predecessors()) { BlockChain *PredChain = BlockToChain[Pred]; if (Pred == Succ || PredChain == &SuccChain || ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang] Enable delayed localization by default for `do concurrent` (PR #142567)
https://github.com/tblah approved this pull request. https://github.com/llvm/llvm-project/pull/142567 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CodeGen] Limit number of analyzed predecessors (PR #142584)
@@ -1030,6 +1036,11 @@ bool MachineBlockPlacement::isTrellis( SmallPtrSet SeenPreds; for (MachineBasicBlock *Succ : ViableSuccs) { +// Compile-time optimization: runtime is quadratic in the number of +// predecessors. For such uncommon cases, exit early. +if (Succ->pred_size() > PredecessorLimit) aengelke wrote: Consider the code from the test generator below. From my understanding, `buildChain` will iterate over all basic blocks of the chain and call `selectBestSuccessor` for each of them, which will in turn call `isTrellis` for every block. `isTrellis` will look at the predecessors of all successors, in particular, it will look at all predecessors of the `merge` block, which are all the other blocks => for almost every block, the code looks at almost all other blocks. Test generator, try n=4: ```python import sys n = int(sys.argv[1]) print("declare void @exit(i32)") print("declare i1 @cond(i32)") print("define i32 @f(i32 %v, i32 %v0) {") for i in range(n): print(f'b{i}:') print(f' %v{i+1} = add i32 %v{i}, %v') print(f' %c{i} = call i1 @cond(i32 %v{i+1})') print(f' br i1 %c{i}, label %merge, label %b{i+1}') print(f'b{n}:') print(f' ret i32 %v{n}') print('merge:') print(' call void @exit(i32 1)') print(' unreachable') print('}') ``` ```console # Without this change $ python3 many-preds3.test 4 | /usr/bin/time ./llvm-build/bin/llc -filetype=obj -o /dev/null -O1 15.93user 0.17system 0:16.18elapsed 99%CPU (0avgtext+0avgdata 457748maxresident)k 0inputs+0outputs (0major+99524minor)pagefaults 0swaps # With this change $ python3 many-preds3.test 4 | /usr/bin/time ./llvm-build/bin/llc -filetype=obj -o /dev/null -O1 8.82user 0.19system 0:09.10elapsed 99%CPU (0avgtext+0avgdata 457240maxresident)k 0inputs+0outputs (0major+99425minor)pagefaults 0swaps ``` https://github.com/llvm/llvm-project/pull/142584 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CVP] Keep `ReachableCaseCount` in sync with range of condition (#142302) (PR #142730)
https://github.com/dtcxzyw created https://github.com/llvm/llvm-project/pull/142730 Backport https://github.com/llvm/llvm-project/commit/0f7cc4132b62e0ecdbd3193e954b745c5f492e90. https://github.com/llvm/llvm-project/pull/79993 assumes that a reachable case must be contained by `CR`. However, it doesn't hold for some edge cases. This patch adds additional checks to ensure `ReachableCaseCount` is correct. Note: Similar optimization in SCCP isn't affected by this bug because it uses `CR` to compute `ReachableCaseCount`. Closes https://github.com/llvm/llvm-project/issues/142286. >From 9f73052846c60357a38e0259eba1675f9b14b8c7 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Mon, 2 Jun 2025 17:42:02 +0800 Subject: [PATCH] [CVP] Keep `ReachableCaseCount` in sync with range of condition (#142302) https://github.com/llvm/llvm-project/pull/79993 assumes that a reachable case must be contained by `CR`. However, it doesn't hold for some edge cases. This patch adds additional checks to ensure `ReachableCaseCount` is correct. Note: Similar optimization in SCCP isn't affected by this bug because it uses `CR` to compute `ReachableCaseCount`. Closes https://github.com/llvm/llvm-project/issues/142286. --- .../Scalar/CorrelatedValuePropagation.cpp | 59 +++ .../CorrelatedValuePropagation/switch.ll | 36 +++ 2 files changed, 71 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 8e74b8645fad9..86c4170b9a977 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -370,15 +370,30 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, { // Scope for SwitchInstProfUpdateWrapper. It must not live during // ConstantFoldTerminator() as the underlying SwitchInst can be changed. 
SwitchInstProfUpdateWrapper SI(*I); +ConstantRange CR = +LVI->getConstantRangeAtUse(I->getOperandUse(0), /*UndefAllowed=*/false); unsigned ReachableCaseCount = 0; for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); - auto *Res = dyn_cast_or_null( - LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, - /* UseBlockValue */ true)); + std::optional Predicate = std::nullopt; + if (!CR.contains(Case->getValue())) +Predicate = false; + else if (CR.isSingleElement() && + *CR.getSingleElement() == Case->getValue()) +Predicate = true; + if (!Predicate) { +// Handle missing cases, e.g., the range has a hole. +auto *Res = dyn_cast_or_null( +LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, +/* UseBlockValue=*/true)); +if (Res && Res->isZero()) + Predicate = false; +else if (Res && Res->isOne()) + Predicate = true; + } - if (Res && Res->isZero()) { + if (Predicate && !*Predicate) { // This case never fires - remove it. BasicBlock *Succ = CI->getCaseSuccessor(); Succ->removePredecessor(BB); @@ -395,7 +410,7 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB, Succ}}); continue; } - if (Res && Res->isOne()) { + if (Predicate && *Predicate) { // This case always fires. Arrange for the switch to be turned into an // unconditional branch by replacing the switch condition with the case // value. @@ -410,28 +425,24 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, ++ReachableCaseCount; } -BasicBlock *DefaultDest = SI->getDefaultDest(); -if (ReachableCaseCount > 1 && -!isa(DefaultDest->getFirstNonPHIOrDbg())) { - ConstantRange CR = LVI->getConstantRangeAtUse(I->getOperandUse(0), -/*UndefAllowed*/ false); - // The default dest is unreachable if all cases are covered. 
- if (!CR.isSizeLargerThan(ReachableCaseCount)) { -BasicBlock *NewUnreachableBB = -BasicBlock::Create(BB->getContext(), "default.unreachable", - BB->getParent(), DefaultDest); -new UnreachableInst(BB->getContext(), NewUnreachableBB); +// The default dest is unreachable if all cases are covered. +if (!SI->defaultDestUndefined() && +!CR.isSizeLargerThan(ReachableCaseCount)) { + BasicBlock *DefaultDest = SI->getDefaultDest(); + BasicBlock *NewUnreachableBB = + BasicBlock::Create(BB->getContext(), "default.unreachable", + BB->getParent(), DefaultDest); + new UnreachableInst(BB->getContext(), NewUnreachableBB); -DefaultDest->removePredecessor(BB); -SI->setDe
[llvm-branch-commits] [CI] Explicitly compute needed runtime targets (PR #142695)
https://github.com/DavidSpickett commented: Agree that `_compute_runtimes_to_test` is a confusing name but if it's going to change anyway, this LGTM. https://github.com/llvm/llvm-project/pull/142695 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI] Explicitly compute needed runtime targets (PR #142695)
https://github.com/DavidSpickett approved this pull request. https://github.com/llvm/llvm-project/pull/142695 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI] Use LLVM_ENABLE_RUNTIMES for runtimes builds on Linux (PR #142694)
Endilll wrote: > I'm not sure how you're seeing this.   https://github.com/llvm/llvm-project/pull/142694 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang][OpenMP] Map basic `local` specifiers to `private` clauses (PR #142735)
https://github.com/tblah approved this pull request. Looks great! https://github.com/llvm/llvm-project/pull/142735 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142739 >From efdaf03ba25edfd254a3b9bc79470ed861e123c1 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 03:32:32 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 92 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + .../AMDGPU/ptradd-sdag-optimizations.ll | 193 ++ 4 files changed, 201 insertions(+), 134 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index aba3c0f80a024..e57e8eb8799e2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -419,6 +419,7 @@ namespace { SDValue visitADDLike(SDNode *N); SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); +SDValue visitPTRADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDSAT(SDNode *N); SDValue visitSUBSAT(SDNode *N); @@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return true; } - if (Opc != ISD::ADD) + if (Opc != ISD::ADD && Opc != ISD::PTRADD) return false; auto *C2 = dyn_cast(N1); @@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::TokenFactor:return 
visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD:return visitADD(N); + case ISD::PTRADD: return visitPTRADD(N); case ISD::SUB:return visitSUB(N); case ISD::SADDSAT: case ISD::UADDSAT:return visitADDSAT(N); @@ -2627,6 +2629,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. +if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. 
+ if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; + SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); + AddToWorklist(Add.getNode()); + return DAG.getMemBasePlusOffset(X, Add, DL, Flags); +} + +// TODO: There is another possible fold here that was proven useful. +// It would be this: +// +// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if: +// * (ptradd x, y) has one use; and +// * y is a constant; and +// * z is not a constant. +// +// In some cases, specifically in AArch64's FEAT_CPA, it exposes the +// opportunity to select more complex instructions such as SUBPT and +// MSUBPT. Howev
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142738 >From 8f51e2d76b4336f81027905b3c9b711eac7b6406 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 3 Jun 2025 09:49:19 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines Pre-committing tests to show improvements in a follow-up PR with the combines. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 207 ++ 1 file changed, 207 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll new file mode 100644 index 0..0241be9197e1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s + +; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG +; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable +; similar transformations in that pass. + +; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use. 
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_ZTwoUses: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_ZTwoUses: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24 + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset + %l = load i64, ptr addrspace(1) %gep1, align 8 + %r = add i64 %l, %voffset + ret i64 %r +} + +define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt 
vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %add0 = add nuw nsw i64 %voffset, 24 + %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 + %l = load i64, ptr addrspace(1) %gep0, align 8 + ret i64 %l +} + +; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These +; would be folded away in most cases, but the index computation introduced by +; the legalization of wide vector stores can for example introduce them. +define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { +; GFX942_PTRADD-LABEL: store_v16i32: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23 +; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16 +; GFX942_PTRADD-NEXT:v_mov_b3
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142739 >From efdaf03ba25edfd254a3b9bc79470ed861e123c1 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 03:32:32 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 92 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + .../AMDGPU/ptradd-sdag-optimizations.ll | 193 ++ 4 files changed, 201 insertions(+), 134 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index aba3c0f80a024..e57e8eb8799e2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -419,6 +419,7 @@ namespace { SDValue visitADDLike(SDNode *N); SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); +SDValue visitPTRADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDSAT(SDNode *N); SDValue visitSUBSAT(SDNode *N); @@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return true; } - if (Opc != ISD::ADD) + if (Opc != ISD::ADD && Opc != ISD::PTRADD) return false; auto *C2 = dyn_cast(N1); @@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::TokenFactor:return 
visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD:return visitADD(N); + case ISD::PTRADD: return visitPTRADD(N); case ISD::SUB:return visitSUB(N); case ISD::SADDSAT: case ISD::UADDSAT:return visitADDSAT(N); @@ -2627,6 +2629,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. +if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. 
+ if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; + SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); + AddToWorklist(Add.getNode()); + return DAG.getMemBasePlusOffset(X, Add, DL, Flags); +} + +// TODO: There is another possible fold here that was proven useful. +// It would be this: +// +// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if: +// * (ptradd x, y) has one use; and +// * y is a constant; and +// * z is not a constant. +// +// In some cases, specifically in AArch64's FEAT_CPA, it exposes the +// opportunity to select more complex instructions such as SUBPT and +// MSUBPT. Howev
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142738 >From 8f51e2d76b4336f81027905b3c9b711eac7b6406 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 3 Jun 2025 09:49:19 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines Pre-committing tests to show improvements in a follow-up PR with the combines. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 207 ++ 1 file changed, 207 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll new file mode 100644 index 0..0241be9197e1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s + +; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG +; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable +; similar transformations in that pass. + +; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use. 
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_ZTwoUses: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_ZTwoUses: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24 + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset + %l = load i64, ptr addrspace(1) %gep1, align 8 + %r = add i64 %l, %voffset + ret i64 %r +} + +define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt 
vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %add0 = add nuw nsw i64 %voffset, 24 + %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 + %l = load i64, ptr addrspace(1) %gep0, align 8 + ret i64 %l +} + +; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These +; would be folded away in most cases, but the index computation introduced by +; the legalization of wide vector stores can for example introduce them. +define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { +; GFX942_PTRADD-LABEL: store_v16i32: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23 +; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16 +; GFX942_PTRADD-NEXT:v_mov_b3
[llvm-branch-commits] [CI] Use LLVM_ENABLE_RUNTIMES for runtimes builds on Linux (PR #142694)
cor3ntin wrote: This is a merge commit, it might be worth rebasing / cherry picking on a clean branch https://github.com/llvm/llvm-project/pull/142694 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI] Use LLVM_ENABLE_RUNTIMES for runtimes builds on Linux (PR #142694)
llvmbot wrote: @llvm/pr-subscribers-libcxx Author: Aiden Grossman (boomanaiden154) Changes This patch switches us to using LLVM_ENABLE_RUNTIMES rather than using separate runtimes builds for some reductions in CMake configuration time and some simplification of the monolithic-linux.sh script. --- Full diff: https://github.com/llvm/llvm-project/pull/142694.diff 1 Files Affected: - (modified) .ci/monolithic-linux.sh (+10-36) ``diff diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index f5a31fa45a641..52a80958b4025 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -102,51 +102,25 @@ if [[ "${runtimes}" != "" ]]; then exit 1 fi - echo "--- ninja install-clang" - - ninja -C ${BUILD_DIR} install-clang install-clang-resource-headers - - RUNTIMES_BUILD_DIR="${MONOREPO_ROOT}/build-runtimes" - INSTALL_DIR="${BUILD_DIR}/install" - mkdir -p ${RUNTIMES_BUILD_DIR} - echo "--- cmake runtimes C++26" - rm -rf "${RUNTIMES_BUILD_DIR}" - cmake -S "${MONOREPO_ROOT}/runtimes" -B "${RUNTIMES_BUILD_DIR}" -GNinja \ - -D CMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \ - -D CMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \ - -D LLVM_ENABLE_RUNTIMES="${runtimes}" \ - -D LIBCXX_CXX_ABI=libcxxabi \ - -D CMAKE_BUILD_TYPE=RelWithDebInfo \ - -D CMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ - -D LIBCXX_TEST_PARAMS="std=c++26" \ - -D LIBCXXABI_TEST_PARAMS="std=c++26" \ - -D LLVM_LIT_ARGS="${lit_args}" + cmake \ +-D LIBCXX_TEST_PARAMS="std=c++26" \ +-D LIBCXXABI_TEST_PARAMS="std=c++26" \ +"${BUILD_DIR}" echo "--- ninja runtimes C++26" - ninja -vC "${RUNTIMES_BUILD_DIR}" ${runtime_targets} + ninja -C "${BUILD_DIR}" ${runtime_targets} echo "--- cmake runtimes clang modules" - # We don't need to do a clean build of runtimes, because LIBCXX_TEST_PARAMS - # and LIBCXXABI_TEST_PARAMS only affect lit configuration, which successfully - # propagates without a clean build. Other that those two variables, builds - # are supposed to be the same. 
- - cmake -S "${MONOREPO_ROOT}/runtimes" -B "${RUNTIMES_BUILD_DIR}" -GNinja \ - -D CMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \ - -D CMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \ - -D LLVM_ENABLE_RUNTIMES="${runtimes}" \ - -D LIBCXX_CXX_ABI=libcxxabi \ - -D CMAKE_BUILD_TYPE=RelWithDebInfo \ - -D CMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ - -D LIBCXX_TEST_PARAMS="enable_modules=clang" \ - -D LIBCXXABI_TEST_PARAMS="enable_modules=clang" \ - -D LLVM_LIT_ARGS="${lit_args}" + cmake \ +-D LIBCXX_TEST_PARAMS="enable_modules=clang" \ +-D LIBCXXABI_TEST_PARAMS="enable_modules=clang" \ +"${BUILD_DIR}" echo "--- ninja runtimes clang modules" - ninja -vC "${RUNTIMES_BUILD_DIR}" ${runtime_targets} + ninja -C "${BUILD_DIR}" ${runtime_targets} fi `` https://github.com/llvm/llvm-project/pull/142694 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen] Limit number of analyzed predecessors (PR #142584)
@@ -1592,6 +1603,11 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb; bool BadCFGConflict = false; + // Compile-time optimization: runtime is quadratic in the number of aengelke wrote: Done https://github.com/llvm/llvm-project/pull/142584 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/142422 >From 887f383aa07cca3fe023cd64b3b119cbf013c17b Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 2 Jun 2025 15:13:13 + Subject: [PATCH 1/3] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` Previously, slices were sometimes marked as non-contiguous when they were actually contiguous. This occurred when the vector type had leading unit dimensions, e.g., `vector<1x1x...x1xd0xd1x...xdn-1xT>``. In such cases, only the trailing n dimensions of the memref need to be contiguous, not the entire vector rank. This affects how `FlattenContiguousRowMajorTransfer{Read,Write}Pattern` flattens `transfer_read` and `transfer_write`` ops. The pattern used to collapse a number of dimensions equal the vector rank, which may be is incorrect when leading dimensions are unit-sized. This patch fixes the issue by collapsing only as many trailing memref dimensions as are actually contiguous. --- .../mlir/Dialect/Vector/Utils/VectorUtils.h | 54 - .../Transforms/VectorTransferOpTransforms.cpp | 8 +- mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp | 25 ++-- .../Vector/vector-transfer-flatten.mlir | 108 +- 4 files changed, 120 insertions(+), 75 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h index 6609b28d77b6c..ed06d7a029494 100644 --- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h +++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h @@ -49,35 +49,37 @@ FailureOr> isTranspose2DSlice(vector::TransposeOp op); /// Return true if `vectorType` is a contiguous slice of `memrefType`. /// -/// Only the N = vectorType.getRank() trailing dims of `memrefType` are -/// checked (the other dims are not relevant). 
Note that for `vectorType` to be -/// a contiguous slice of `memrefType`, the trailing dims of the latter have -/// to be contiguous - this is checked by looking at the corresponding strides. +/// The leading unit dimensions of the vector type are ignored as they +/// are not relevant to the result. Let N be the number of the vector +/// dimensions after ignoring a leading sequence of unit ones. /// -/// There might be some restriction on the leading dim of `VectorType`: +/// For `vectorType` to be a contiguous slice of `memrefType` +/// a) the N trailing dimensions of the latter must be contiguous, and +/// b) the trailing N dimensions of `vectorType` and `memrefType`, +/// except the first of them, must match. /// -/// Case 1. If all the trailing dims of `vectorType` match the trailing dims -/// of `memrefType` then the leading dim of `vectorType` can be -/// arbitrary. -/// -///Ex. 1.1 contiguous slice, perfect match -/// vector<4x3x2xi32> from memref<5x4x3x2xi32> -///Ex. 1.2 contiguous slice, the leading dim does not match (2 != 4) -/// vector<2x3x2xi32> from memref<5x4x3x2xi32> -/// -/// Case 2. If an "internal" dim of `vectorType` does not match the -/// corresponding trailing dim in `memrefType` then the remaining -/// leading dims of `vectorType` have to be 1 (the first non-matching -/// dim can be arbitrary). +/// Examples: /// -///Ex. 2.1 non-contiguous slice, 2 != 3 and the leading dim != <1> -/// vector<2x2x2xi32> from memref<5x4x3x2xi32> -///Ex. 2.2 contiguous slice, 2 != 3 and the leading dim == <1> -/// vector<1x2x2xi32> from memref<5x4x3x2xi32> -///Ex. 2.3. contiguous slice, 2 != 3 and the leading dims == <1x1> -/// vector<1x1x2x2xi32> from memref<5x4x3x2xi32> -///Ex. 2.4. 
non-contiguous slice, 2 != 3 and the leading dims != <1x1> -/// vector<2x1x2x2xi32> from memref<5x4x3x2xi32>) +/// Ex.1 contiguous slice, perfect match +/// vector<4x3x2xi32> from memref<5x4x3x2xi32> +/// Ex.2 contiguous slice, the leading dim does not match (2 != 4) +/// vector<2x3x2xi32> from memref<5x4x3x2xi32> +/// Ex.3 non-contiguous slice, 2 != 3 +/// vector<2x2x2xi32> from memref<5x4x3x2xi32> +/// Ex.4 contiguous slice, leading unit dimension of the vector ignored, +///2 != 3 (allowed) +/// vector<1x2x2xi32> from memref<5x4x3x2xi32> +/// Ex.5. contiguous slice, leasing two unit dims of the vector ignored, +/// 2 != 3 (allowed) +/// vector<1x1x2x2xi32> from memref<5x4x3x2xi32> +/// Ex.6. non-contiguous slice, 2 != 3, no leading sequence of unit dims +/// vector<2x1x2x2xi32> from memref<5x4x3x2xi32>) +/// Ex.7 contiguous slice, memref needs to be contiguous only on the last +///dimension +/// vector<1x1x2xi32> from memref<2x2x2xi32, strided<[8, 4, 1]>> +/// Ex.8 non-contiguous slice, memref needs to be contiguous one the last +///two dimensions, and it isn't +/// vector<1x2x2xi32> from memref<2x2x2xi32, strided<[8, 4, 1]>> bool isContiguo
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/142422 >From 887f383aa07cca3fe023cd64b3b119cbf013c17b Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 2 Jun 2025 15:13:13 + Subject: [PATCH 1/3] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` Previously, slices were sometimes marked as non-contiguous when they were actually contiguous. This occurred when the vector type had leading unit dimensions, e.g., `vector<1x1x...x1xd0xd1x...xdn-1xT>``. In such cases, only the trailing n dimensions of the memref need to be contiguous, not the entire vector rank. This affects how `FlattenContiguousRowMajorTransfer{Read,Write}Pattern` flattens `transfer_read` and `transfer_write`` ops. The pattern used to collapse a number of dimensions equal the vector rank, which may be is incorrect when leading dimensions are unit-sized. This patch fixes the issue by collapsing only as many trailing memref dimensions as are actually contiguous. --- .../mlir/Dialect/Vector/Utils/VectorUtils.h | 54 - .../Transforms/VectorTransferOpTransforms.cpp | 8 +- mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp | 25 ++-- .../Vector/vector-transfer-flatten.mlir | 108 +- 4 files changed, 120 insertions(+), 75 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h index 6609b28d77b6c..ed06d7a029494 100644 --- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h +++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h @@ -49,35 +49,37 @@ FailureOr> isTranspose2DSlice(vector::TransposeOp op); /// Return true if `vectorType` is a contiguous slice of `memrefType`. /// -/// Only the N = vectorType.getRank() trailing dims of `memrefType` are -/// checked (the other dims are not relevant). 
Note that for `vectorType` to be -/// a contiguous slice of `memrefType`, the trailing dims of the latter have -/// to be contiguous - this is checked by looking at the corresponding strides. +/// The leading unit dimensions of the vector type are ignored as they +/// are not relevant to the result. Let N be the number of the vector +/// dimensions after ignoring a leading sequence of unit ones. /// -/// There might be some restriction on the leading dim of `VectorType`: +/// For `vectorType` to be a contiguous slice of `memrefType` +/// a) the N trailing dimensions of the latter must be contiguous, and +/// b) the trailing N dimensions of `vectorType` and `memrefType`, +/// except the first of them, must match. /// -/// Case 1. If all the trailing dims of `vectorType` match the trailing dims -/// of `memrefType` then the leading dim of `vectorType` can be -/// arbitrary. -/// -///Ex. 1.1 contiguous slice, perfect match -/// vector<4x3x2xi32> from memref<5x4x3x2xi32> -///Ex. 1.2 contiguous slice, the leading dim does not match (2 != 4) -/// vector<2x3x2xi32> from memref<5x4x3x2xi32> -/// -/// Case 2. If an "internal" dim of `vectorType` does not match the -/// corresponding trailing dim in `memrefType` then the remaining -/// leading dims of `vectorType` have to be 1 (the first non-matching -/// dim can be arbitrary). +/// Examples: /// -///Ex. 2.1 non-contiguous slice, 2 != 3 and the leading dim != <1> -/// vector<2x2x2xi32> from memref<5x4x3x2xi32> -///Ex. 2.2 contiguous slice, 2 != 3 and the leading dim == <1> -/// vector<1x2x2xi32> from memref<5x4x3x2xi32> -///Ex. 2.3. contiguous slice, 2 != 3 and the leading dims == <1x1> -/// vector<1x1x2x2xi32> from memref<5x4x3x2xi32> -///Ex. 2.4. 
non-contiguous slice, 2 != 3 and the leading dims != <1x1> -/// vector<2x1x2x2xi32> from memref<5x4x3x2xi32>) +/// Ex.1 contiguous slice, perfect match +/// vector<4x3x2xi32> from memref<5x4x3x2xi32> +/// Ex.2 contiguous slice, the leading dim does not match (2 != 4) +/// vector<2x3x2xi32> from memref<5x4x3x2xi32> +/// Ex.3 non-contiguous slice, 2 != 3 +/// vector<2x2x2xi32> from memref<5x4x3x2xi32> +/// Ex.4 contiguous slice, leading unit dimension of the vector ignored, +///2 != 3 (allowed) +/// vector<1x2x2xi32> from memref<5x4x3x2xi32> +/// Ex.5. contiguous slice, leasing two unit dims of the vector ignored, +/// 2 != 3 (allowed) +/// vector<1x1x2x2xi32> from memref<5x4x3x2xi32> +/// Ex.6. non-contiguous slice, 2 != 3, no leading sequence of unit dims +/// vector<2x1x2x2xi32> from memref<5x4x3x2xi32>) +/// Ex.7 contiguous slice, memref needs to be contiguous only on the last +///dimension +/// vector<1x1x2xi32> from memref<2x2x2xi32, strided<[8, 4, 1]>> +/// Ex.8 non-contiguous slice, memref needs to be contiguous one the last +///two dimensions, and it isn't +/// vector<1x2x2xi32> from memref<2x2x2xi32, strided<[8, 4, 1]>> bool isContiguo
[llvm-branch-commits] [CI] Use LLVM_ENABLE_RUNTIMES for runtimes builds on Linux (PR #142694)
Endilll wrote: I just opened 9e3490b51f85d1aff3978dc32aadde4531363774 in my local git, and yes, all libc++ changes are there alongside changes to `monolithic-linux.sh` that we're interested in https://github.com/llvm/llvm-project/pull/142694 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
https://github.com/ritter-x2a ready_for_review https://github.com/llvm/llvm-project/pull/142738 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] New RegBankSelect: Add missing S/VGPR pointer types (PR #142600)
https://github.com/Pierre-vh closed https://github.com/llvm/llvm-project/pull/142600 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] New RegBanKSelect: Add S128 types (PR #142601)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/142601 >From 96669eee5e756faed679480521faafd9f1bad9d1 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 3 Jun 2025 13:27:55 +0200 Subject: [PATCH] [AMDGPU] New RegBanKSelect: Add S128 types --- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 9 + llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 6 ++ llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h| 5 + 3 files changed, 20 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 7ff822c6f6580..89af982636590 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -556,6 +556,9 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr64: case Vgpr64: return LLT::scalar(64); + case Sgpr128: + case Vgpr128: +return LLT::scalar(128); case VgprP0: return LLT::pointer(0, 64); case SgprP1: @@ -646,6 +649,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Sgpr16: case Sgpr32: case Sgpr64: + case Sgpr128: case SgprP1: case SgprP3: case SgprP4: @@ -678,6 +682,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Vgpr16: case Vgpr32: case Vgpr64: + case Vgpr128: case VgprP0: case VgprP1: case VgprP3: @@ -718,6 +723,7 @@ void RegBankLegalizeHelper::applyMappingDst( case Sgpr16: case Sgpr32: case Sgpr64: +case Sgpr128: case SgprP1: case SgprP3: case SgprP4: @@ -728,6 +734,7 @@ void RegBankLegalizeHelper::applyMappingDst( case Vgpr16: case Vgpr32: case Vgpr64: +case Vgpr128: case VgprP0: case VgprP1: case VgprP3: @@ -839,6 +846,7 @@ void RegBankLegalizeHelper::applyMappingSrc( case Sgpr16: case Sgpr32: case Sgpr64: +case Sgpr128: case SgprP1: case SgprP3: case SgprP4: @@ -865,6 +873,7 @@ void RegBankLegalizeHelper::applyMappingSrc( case Vgpr16: case Vgpr32: case Vgpr64: +case Vgpr128: case 
VgprP0: case VgprP1: case VgprP3: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5e21f44f7d545..672fc5b79abc2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::scalar(32); case S64: return MRI.getType(Reg) == LLT::scalar(64); + case S128: +return MRI.getType(Reg) == LLT::scalar(128); case P0: return MRI.getType(Reg) == LLT::pointer(0, 64); case P1: @@ -84,6 +86,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); case UniS64: return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + case UniS128: +return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniform(Reg); case UniP0: return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg); case UniP1: @@ -116,6 +120,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); case DivS64: return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivS128: +return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergent(Reg); case DivP0: return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg); case DivP1: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index bddfb8dd1913f..30b900d871f3c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -39,16 +39,19 @@ enum UniformityLLTOpPredicateID { S16, S32, S64, + S128, UniS1, UniS16, UniS32, UniS64, + UniS128, DivS1, DivS16, DivS32, DivS64, + DivS128, // pointers P0, @@ -117,6 +120,7 @@ enum RegBankLLTMappingApplyID { Sgpr16, Sgpr32, 
Sgpr64, + Sgpr128, SgprP1, SgprP3, SgprP4, @@ -135,6 +139,7 @@ enum RegBankLLTMappingApplyID { Vgpr16, Vgpr32, Vgpr64, + Vgpr128, VgprP0, VgprP1, VgprP3, ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] New RegBankSelect: Add rules for `G_PTRTOINT` and `G_INTTOPTR` (PR #142604)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/142604 >From 9fd34f632f194a025669b2c2c0f83d19fb48b00c Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 3 Jun 2025 15:08:06 +0200 Subject: [PATCH 1/3] [AMDGPU] New RegBankSelect: Add rules for `G_PTRTOINT` and `G_INTTOPTR` --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 16 ++- .../GlobalISel/regbankselect-inttoptr.mir | 98 +++ .../GlobalISel/regbankselect-ptrtoint.mir | 98 +++ 3 files changed, 211 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5402129e41887..61d7d084b21da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -718,7 +718,21 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}); - addRulesForGOpcs({G_INTTOPTR}).Any({{UniP4}, {{SgprP4}, {Sgpr64}}}); + addRulesForGOpcs({G_INTTOPTR}) + .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}}) + .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}}) + .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}}) + .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}}) + .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}}) + .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}}); + + addRulesForGOpcs({G_PTRTOINT}) + .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}}) + .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}}) + .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}}) + .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}}) + .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}}) + .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}}); addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir index 42600d7d0dd7a..d9b1b6e20089b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir 
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir @@ -2,6 +2,8 @@ # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s --check-prefix=NEW_RBS + --- name: inttoptr_s_p0 legalized: true @@ -14,6 +16,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p0) = G_INTTOPTR [[COPY]](s64) +; +; NEW_RBS-LABEL: name: inttoptr_s_p0 +; NEW_RBS: liveins: $sgpr0_sgpr1 +; NEW_RBS-NEXT: {{ $}} +; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 +; NEW_RBS-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p0) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $sgpr0_sgpr1 %1:_(p0) = G_INTTOPTR %0 ... @@ -30,6 +38,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p0) = G_INTTOPTR [[COPY]](s64) +; +; NEW_RBS-LABEL: name: inttoptr_v_p0 +; NEW_RBS: liveins: $vgpr0_vgpr1 +; NEW_RBS-NEXT: {{ $}} +; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 +; NEW_RBS-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p0) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(p0) = G_INTTOPTR %0 ... @@ -46,6 +60,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p1) = G_INTTOPTR [[COPY]](s64) +; +; NEW_RBS-LABEL: name: inttoptr_s_p1 +; NEW_RBS: liveins: $sgpr0_sgpr1 +; NEW_RBS-NEXT: {{ $}} +; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 +; NEW_RBS-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p1) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $sgpr0_sgpr1 %1:_(p1) = G_INTTOPTR %0 ... 
@@ -62,6 +82,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p1) = G_INTTOPTR [[COPY]](s64) +; +; NEW_RBS-LABEL: name: inttoptr_v_p1 +; NEW_RBS: liveins: $vgpr0_vgpr1 +; NEW_RBS-NEXT: {{ $}} +; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 +; NEW_RBS-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p1) = G_INTTOPTR [[COPY]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(p1) = G_INTTOPTR %0 ... @@ -78,6 +104,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p2) = G_INTTOPTR [[COPY]](s32) +; +
[llvm-branch-commits] [llvm] [LoopVectorizer] Bundle partial reductions with different extensions (PR #136997)
SamTebbs33 wrote: Really sorry for the spam again, I pushed to the user branch in my fork rather than the base branch in llvm :facepalm: https://github.com/llvm/llvm-project/pull/136997 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
https://github.com/ritter-x2a created https://github.com/llvm/llvm-project/pull/142777 Pre-committing test to show improvements in a follow-up PR. >From 1fe91cbd5d3a3f8baa59eb389936f92b0a49ab6c Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 09:30:34 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis Pre-committing test to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 29 +++ 1 file changed, 29 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 656003f45c54b..bce59307446ce 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -132,3 +132,32 @@ declare noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() declare i64 @llvm.amdgcn.dispatch.id() declare noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + +; Taken from memcpy-param-combinations.ll, tests PTRADD handling in +; SelectionDAGAddressAnalysis. 
+define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { +; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) + ret void +} + ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis (PR #142778)
https://github.com/ritter-x2a created https://github.com/llvm/llvm-project/pull/142778 This is used in a bunch of memory-related transforms. For SWDEV-516125. >From 269663e10674813074ccd9645b431fed0287a405 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 09:48:02 -0400 Subject: [PATCH] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis This is used in a bunch of memory-related transforms. For SWDEV-516125. --- .../SelectionDAGAddressAnalysis.cpp | 6 ++-- .../AMDGPU/ptradd-sdag-optimizations.ll | 28 ++- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index f2ab88851b780..da92aaa860b2b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -231,6 +231,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } break; case ISD::ADD: +case ISD::PTRADD: if (auto *C = dyn_cast(Base->getOperand(1))) { Offset += C->getSExtValue(); Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0)); @@ -259,7 +260,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, break; } - if (Base->getOpcode() == ISD::ADD) { + if (Base->isAnyAdd()) { // TODO: The following code appears to be needless as it just // bails on some Ptrs early, reducing the cases where we // find equivalence. We should be able to remove this. 
@@ -282,8 +283,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } // Check if Index Offset pattern -if (Index->getOpcode() != ISD::ADD || -!isa(Index->getOperand(1))) +if (!Index->isAnyAdd() || !isa(Index->getOperand(1))) return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt); Offset += cast(Index->getOperand(1))->getSExtValue(); diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index bce59307446ce..1069339774894 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -136,26 +136,14 @@ declare noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() ; Taken from memcpy-param-combinations.ll, tests PTRADD handling in ; SelectionDAGAddressAnalysis. define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { -; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +; GFX942-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942: ; %bb.0: ; 
%entry +; GFX942-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) ret void ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis (PR #142778)
ritter-x2a wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142778?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142778** https://app.graphite.dev/github/pr/llvm/llvm-project/142778?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142778?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142777** https://app.graphite.dev/github/pr/llvm/llvm-project/142777?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142739** https://app.graphite.dev/github/pr/llvm/llvm-project/142739?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142738** https://app.graphite.dev/github/pr/llvm/llvm-project/142738?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#141725** https://app.graphite.dev/github/pr/llvm/llvm-project/141725?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142778 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
ritter-x2a wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142777?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142778** https://app.graphite.dev/github/pr/llvm/llvm-project/142778?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142777** https://app.graphite.dev/github/pr/llvm/llvm-project/142777?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142777?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142739** https://app.graphite.dev/github/pr/llvm/llvm-project/142739?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142738** https://app.graphite.dev/github/pr/llvm/llvm-project/142738?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#141725** https://app.graphite.dev/github/pr/llvm/llvm-project/141725?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142777 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Fabian Ritter (ritter-x2a) Changes Pre-committing test to show improvements in a follow-up PR. --- Full diff: https://github.com/llvm/llvm-project/pull/142777.diff 1 Files Affected: - (modified) llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll (+29) ``diff diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 656003f45c54b..bce59307446ce 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -132,3 +132,32 @@ declare noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() declare i64 @llvm.amdgcn.dispatch.id() declare noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + +; Taken from memcpy-param-combinations.ll, tests PTRADD handling in +; SelectionDAGAddressAnalysis. +define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { +; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] 
+entry: + tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) + ret void +} + `` https://github.com/llvm/llvm-project/pull/142777 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
https://github.com/ritter-x2a ready_for_review https://github.com/llvm/llvm-project/pull/142777 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis (PR #142778)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Fabian Ritter (ritter-x2a) Changes This is used in a bunch of memory-related transforms. For SWDEV-516125. --- Full diff: https://github.com/llvm/llvm-project/pull/142778.diff 2 Files Affected: - (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp (+3-3) - (modified) llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll (+8-20) ``diff diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index f2ab88851b780..da92aaa860b2b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -231,6 +231,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } break; case ISD::ADD: +case ISD::PTRADD: if (auto *C = dyn_cast(Base->getOperand(1))) { Offset += C->getSExtValue(); Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0)); @@ -259,7 +260,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, break; } - if (Base->getOpcode() == ISD::ADD) { + if (Base->isAnyAdd()) { // TODO: The following code appears to be needless as it just // bails on some Ptrs early, reducing the cases where we // find equivalence. We should be able to remove this. 
@@ -282,8 +283,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } // Check if Index Offset pattern -if (Index->getOpcode() != ISD::ADD || -!isa(Index->getOperand(1))) +if (!Index->isAnyAdd() || !isa(Index->getOperand(1))) return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt); Offset += cast(Index->getOperand(1))->getSExtValue(); diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index bce59307446ce..1069339774894 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -136,26 +136,14 @@ declare noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() ; Taken from memcpy-param-combinations.ll, tests PTRADD handling in ; SelectionDAGAddressAnalysis. define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { -; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +; GFX942-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942: ; %bb.0: ; 
%entry +; GFX942-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) ret void `` https://github.com/llvm/llvm-project/pull/142778 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis (PR #142778)
https://github.com/ritter-x2a ready_for_review https://github.com/llvm/llvm-project/pull/142778 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64][SME] Support split ZPR and PPR area allocation (PR #142392)
@@ -3780,25 +3938,49 @@ void AArch64FrameLowering::determineStackHazardSlot( bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { return AArch64::FPR64RegClass.contains(Reg) || AArch64::FPR128RegClass.contains(Reg) || - AArch64::ZPRRegClass.contains(Reg) || - AArch64::PPRRegClass.contains(Reg); + AArch64::ZPRRegClass.contains(Reg); + }); + bool HasPPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { +return AArch64::PPRRegClass.contains(Reg); }); bool HasFPRStackObjects = false; - if (!HasFPRCSRs) { -std::vector FrameObjects(MFI.getObjectIndexEnd()); + bool HasPPRStackObjects = false; + if (!HasFPRCSRs || SplitSVEObjects) { +enum SlotType : uint8_t { + Unknown = 0, + ZPRorFPR = 1 << 0, + PPR = 1 << 1, + GPR = 1 << 2, + LLVM_MARK_AS_BITMASK_ENUM(GPR) +}; + +// Find stack slots solely used for one kind of register (ZPR, PPR, etc.), +// based on the kinds of accesses used in the function. +SmallVector SlotTypes(MFI.getObjectIndexEnd(), SlotType::Unknown); for (auto &MBB : MF) { for (auto &MI : MBB) { std::optional FI = getLdStFrameID(MI, MFI); -if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.isScalableStackID(*FI) || AArch64InstrInfo::isFpOrNEON(MI)) -FrameObjects[*FI] |= 2; - else -FrameObjects[*FI] |= 1; +if (!FI || FI < 0 || FI > int(SlotTypes.size())) + continue; +bool IsScalable = MFI.isScalableStackID(*FI); +bool IsPPR = IsScalable && isPPRAccess(MI); +if (IsScalable || AArch64InstrInfo::isFpOrNEON(MI)) { + SlotTypes[*FI] |= IsPPR ? SlotType::PPR : SlotType::ZPRorFPR; +} else { + SlotTypes[*FI] |= SlotType::GPR; } } } -HasFPRStackObjects = -any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; }); + +for (int FI = 0; FI < int(SlotTypes.size()); ++FI) { + HasFPRStackObjects |= SlotTypes[FI] == SlotType::ZPRorFPR; + // For SplitSVEObjects remember that this stack slot is a predicate, this + // will be needed later when determining the frame layout. 
+ if (SlotTypes[FI] == SlotType::PPR) { MacDue wrote: Which bit does not look right? `SlotType::PPR` (not to be confused with `SlotType::GPR`) is only set if the original stack ID was scalable and all accesses to that slot used predicate load/stores. The original stack ID could be `ScalableVector`, as the earlier selection of the stack ID is only based on the type size. Therefore, we change it to `ScalablePredVector` here so that it can be sorted into the correct region. https://github.com/llvm/llvm-project/pull/142392 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI] Use LLVM_ENABLE_RUNTIMES for runtimes builds on Linux (PR #142694)
https://github.com/DavidSpickett approved this pull request. CMake once with the runtimes enabled. Modify the test parameters each time. Looks good to me. https://github.com/llvm/llvm-project/pull/142694 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang][OpenMP] - When mapping a `fir.boxchar`, map the underlying data pointer as a member (PR #141715)
@@ -285,6 +285,62 @@ class MapInfoFinalizationPass return false; } + mlir::omp::MapInfoOp genBoxcharMemberMap(mlir::omp::MapInfoOp op, bhandarkar-pranav wrote: Thank you, @agozillon for the review. Yes, it was related to the fact that `genDescriptorMemberMaps` is tied quite tightly to `BaseBoxType`. So, I decided to not mess with it. I should add it to my to-do list though to generalize `genDescriptorMemberMaps`. https://github.com/llvm/llvm-project/pull/141715 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang] Enable delayed localization by default for `do concurrent` (PR #142567)
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/142567 >From fd943c277c419105055caf92180f641550b747f3 Mon Sep 17 00:00:00 2001 From: ergawy Date: Tue, 3 Jun 2025 04:22:20 -0500 Subject: [PATCH] [flang] Enable delayed localization by default for `do concurrent` --- flang/lib/Lower/Bridge.cpp| 6 +- flang/test/Lower/do_concurrent_delayed_locality.f90 | 2 +- flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +- flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +- flang/test/Lower/loops.f90| 2 +- flang/test/Lower/loops3.f90 | 2 +- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 64b16b3abe991..5ff8101dba097 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2033,11 +2033,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::LocalitySpecifierOperands privateClauseOps; auto doConcurrentLoopOp = mlir::dyn_cast_if_present(info.loopOp); -// TODO Promote to using `enableDelayedPrivatization` (which is enabled by -// default unlike the staging flag) once the implementation of this is more -// complete. -bool useDelayedPriv = -enableDelayedPrivatizationStaging && doConcurrentLoopOp; +bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp; llvm::SetVector allPrivatizedSymbols; llvm::SmallSet mightHaveReadHostSym; diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90 index 6cae0eb46db13..039b17808d19e 100644 --- a/flang/test/Lower/do_concurrent_delayed_locality.f90 +++ b/flang/test/Lower/do_concurrent_delayed_locality.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s +! 
RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s subroutine do_concurrent_with_locality_specs implicit none diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90 index 3d4f97e482e23..c941328e3195a 100644 --- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 +++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s subroutine local_assoc implicit none diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90 index d643213854744..798cbb335c8c0 100644 --- a/flang/test/Lower/do_concurrent_local_default_init.f90 +++ b/flang/test/Lower/do_concurrent_local_default_init.f90 @@ -1,5 +1,5 @@ ! Test default initialization of DO CONCURRENT LOCAL() entities. -! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s +! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s subroutine test_ptr(p) interface diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 60df27a591dc3..64f14ff972272 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -1,4 +1,4 @@ -! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s +! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s ! CHECK-LABEL: loop_test subroutine loop_test diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90 index 84db1972cca16..34d7bcfb7d7ad 100644 --- a/flang/test/Lower/loops3.f90 +++ b/flang/test/Lower/loops3.f90 @@ -1,5 +1,5 @@ ! Test do concurrent reduction -! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s +! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s ! 
CHECK-LABEL: loop_test subroutine loop_test ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang][OpenMP] Map basic `local` specifiers to `private` clauses (PR #142735)
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/142735 >From 62596cdddcbda71b2a1b306cedd8de238510b105 Mon Sep 17 00:00:00 2001 From: ergawy Date: Wed, 4 Jun 2025 00:54:37 -0500 Subject: [PATCH] [flang][OpenMP] Map basic `local` specifiers to `private` clauses Starts the effort to map `do concurrent` locality specifiers to OpenMP clauses. This PR adds support for basic specifiers (no `init` or `copy` regions yet). --- .../OpenMP/DoConcurrentConversion.cpp | 55 ++- .../locality_specifiers_simple.mlir | 48 2 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 0fdb302fe10ca..283c3052c166c 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -7,9 +7,11 @@ //===--===// #include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/OpenMP/Utils.h" +#include "flang/Support/OpenMP-utils.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/IRMapping.h" @@ -308,10 +310,47 @@ class DoConcurrentConversion fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper, const mlir::omp::LoopNestOperands &clauseOps, bool isComposite) const { +mlir::omp::WsloopOperands wsloopClauseOps; + +// For `local` (and `local_init`) opernads, emit corresponding `private` +// clauses and attach these clauses to the workshare loop. 
+if (!loop.getLocalOperands().empty()) + for (auto [op, sym, arg] : llvm::zip_equal( + loop.getLocalOperands(), + loop.getLocalSymsAttr().getAsRange(), + loop.getRegionLocalArgs())) { +auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom< +fir::LocalitySpecifierOp>(loop, sym); +if (localizer.getLocalitySpecifierType() == +fir::LocalitySpecifierType::LocalInit) + TODO(localizer.getLoc(), + "local_init conversion is not supported yet"); + +if (!localizer.getInitRegion().empty()) + TODO(localizer.getLoc(), + "non-empty `init` regions are not supported yet"); + +auto oldIP = rewriter.saveInsertionPoint(); +rewriter.setInsertionPointAfter(localizer); +auto privatizer = rewriter.create( +localizer.getLoc(), sym.getLeafReference().str() + ".omp", +localizer.getTypeAttr().getValue(), +mlir::omp::DataSharingClauseType::Private); +rewriter.restoreInsertionPoint(oldIP); + +wsloopClauseOps.privateVars.push_back(op); +wsloopClauseOps.privateSyms.push_back( +mlir::SymbolRefAttr::get(privatizer)); + } -auto wsloopOp = rewriter.create(loop.getLoc()); +auto wsloopOp = +rewriter.create(loop.getLoc(), wsloopClauseOps); wsloopOp.setComposite(isComposite); -rewriter.createBlock(&wsloopOp.getRegion()); + +Fortran::common::openmp::EntryBlockArgs wsloopArgs; +wsloopArgs.priv.vars = wsloopClauseOps.privateVars; +Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs, + wsloopOp.getRegion()); auto loopNestOp = rewriter.create(loop.getLoc(), clauseOps); @@ -324,6 +363,18 @@ class DoConcurrentConversion rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back()); rewriter.create(loop->getLoc()); +// `local` region arguments are transferred/cloned from the `do concurrent` +// loop to the loopnest op when the region is cloned above. Instead, these +// region arguments should be on the workshare loop's region. 
+for (auto [wsloopArg, loopNestArg] : + llvm::zip_equal(wsloopOp.getRegion().getArguments(), + loopNestOp.getRegion().getArguments().drop_front( + clauseOps.loopLowerBounds.size( + rewriter.replaceAllUsesWith(loopNestArg, wsloopArg); + +for (unsigned i = 0; i < loop.getLocalVars().size(); ++i) + loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size()); + return loopNestOp; } diff --git a/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir new file mode 100644 index 0..160c1df040680 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir @@ -0,0 +1,48 @@ +// Tests mapping `local` locality specifier to `private` clauses for a simple +// c
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #142789)
https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/142789 None >From fcd0dc75f4674297ef1f5c591ecf6c16314ce3e2 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 4 Jun 2025 17:12:16 +0200 Subject: [PATCH] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize --- .../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 136 +++--- .../AMDGPU/GlobalISel/readanylane-combines.ll | 25 +--- .../GlobalISel/readanylane-combines.mir | 78 +++--- 3 files changed, 139 insertions(+), 100 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index ba661348ca5b5..b5fe0ed499255 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -23,6 +23,7 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -137,7 +138,123 @@ class AMDGPURegBankLegalizeCombiner { return {MatchMI, MatchMI->getOperand(1).getReg()}; } + std::tuple tryMatchRALFromUnmerge(Register Src) { +auto *ReadAnyLane = MRI.getVRegDef(Src); +if (ReadAnyLane->getOpcode() == AMDGPU::G_AMDGPU_READANYLANE) { + Register RALSrc = ReadAnyLane->getOperand(1).getReg(); + auto *UnMerge = getOpcodeDef(RALSrc, MRI); + if (UnMerge) +return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; +} +return {nullptr, -1}; + } + + Register getReadAnyLaneSrc(Register Src) { +// Src = G_AMDGPU_READANYLANE RALSrc +auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); +if (RAL) + return RALSrc; + +// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc +// LoSgpr = G_AMDGPU_READANYLANE LoVgpr +// HiSgpr = G_AMDGPU_READANYLANE HiVgpr +// Src G_MERGE_VALUES LoSgpr, HiSgpr +auto *Merge = 
getOpcodeDef(Src, MRI); +if (Merge) { + unsigned NumElts = Merge->getNumSources(); + auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); + if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) +return {}; + + // check if all elements are from same unmerge and there is no shuffling + for (unsigned i = 1; i < NumElts; ++i) { +auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); +if (UnmergeI != Unmerge || (unsigned)IdxI != i) + return {}; + } + return Unmerge->getSourceReg(); +} + +// ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge +// SgprI = G_AMDGPU_READANYLANE VgprI +// SgprLarge G_MERGE_VALUES ..., SgprI, ... +// ..., Src, ... = G_UNMERGE_VALUES SgprLarge +auto *UnMerge = getOpcodeDef(Src, MRI); +if (UnMerge) { + int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); + auto *Merge = getOpcodeDef(UnMerge->getSourceReg(), MRI); + if (Merge) { +auto [RAL, RALSrc] = +tryMatch(Merge->getSourceReg(Idx), AMDGPU::G_AMDGPU_READANYLANE); +if (RAL) + return RALSrc; + } +} + +return {}; + } + + bool tryEliminateReadAnyLane(MachineInstr &Copy) { +Register Dst = Copy.getOperand(0).getReg(); +Register Src = Copy.getOperand(1).getReg(); +if (!Src.isVirtual()) + return false; + +Register RALDst = Src; +MachineInstr &SrcMI = *MRI.getVRegDef(Src); +if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) { + RALDst = SrcMI.getOperand(1).getReg(); +} + +Register RALSrc = getReadAnyLaneSrc(RALDst); +if (!RALSrc) + return false; + +if (Dst.isVirtual()) { + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { +// Src = READANYLANE RALSrc +// Dst = Copy Src +// -> +// Dst = RALSrc +MRI.replaceRegWith(Dst, RALSrc); + } else { +// RALDst = READANYLANE RALSrc +// Src = G_BITCAST RALDst +// Dst = Copy Src +// -> +// NewVgpr = G_BITCAST RALDst +// Dst = NewVgpr +auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); +MRI.replaceRegWith(Dst, Bitcast.getReg(0)); + } +} else { + B.setInstr(Copy); + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { +// Src = 
READANYLANE RALSrc +// $Dst = Copy Src +// -> +// $Dst = Copy RALSrc +B.buildCopy(Dst, RALSrc); + } else { +// RALDst = READANYLANE RALSrc +// Src = G_BITCAST RALDst +// Dst = Copy Src +// -> +// NewVgpr = G_BITCAST RALDst +// $Dst = Copy NewVgpr +auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); +B.buildCopy(Dst, Bitcast.getReg(0)); + } +} + +eraseInstr(Copy, MRI, nullptr);
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #142789)
petar-avramovic wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142789?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142790** https://app.graphite.dev/github/pr/llvm/llvm-project/142790?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142789** https://app.graphite.dev/github/pr/llvm/llvm-project/142789?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142789?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142788** https://app.graphite.dev/github/pr/llvm/llvm-project/142788?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142789 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #142790)
petar-avramovic wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142790?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142790** https://app.graphite.dev/github/pr/llvm/llvm-project/142790?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142790?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142789** https://app.graphite.dev/github/pr/llvm/llvm-project/142789?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142788** https://app.graphite.dev/github/pr/llvm/llvm-project/142788?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142790 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #142789)
https://github.com/petar-avramovic ready_for_review https://github.com/llvm/llvm-project/pull/142789 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #142790)
llvmbot wrote: @llvm/pr-subscribers-llvm-globalisel Author: Petar Avramovic (petar-avramovic) Changes Add rules for G_AMDGPU_BUFFER_LOAD and implement waterfall lowering for divergent operands that must be sgpr. --- Patch is 89.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142790.diff 17 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp (+41-13) - (modified) llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h (+2) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (+238-2) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+18-10) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h (+2) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll (+28-31) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll (+28-31) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll (+28-31) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll (+28-31) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll (+22-20) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll (+22-20) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll (+25-23) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll (+25-23) - (modified) llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll (+2-2) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 00979f44f9d34..b3edb959e14c3 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -117,45 +117,73 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) { return LLT::scalar(32); } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI); +typedef std::function +ReadLaneFnTy; + +static Register buildReadLane(MachineIRBuilder &, Register, + const RegisterBankInfo &, ReadLaneFnTy); static void unmergeReadAnyLane(MachineIRBuilder &B, SmallVectorImpl &SgprDstParts, LLT UnmergeTy, Register VgprSrc, - const RegisterBankInfo &RBI) { + const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID); auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc); for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { -SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI)); +SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL)); } } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI) { +static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc, + const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { LLT Ty = B.getMRI()->getType(VgprSrc); const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID); if (Ty.getSizeInBits() == 32) { -return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc}) -.getReg(0); +Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty}); +return BuildRL(B, SgprDst, VgprSrc).getReg(0); } SmallVector SgprDstParts; - unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI, + BuildRL); return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0); } -void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, - Register VgprSrc, const RegisterBankInfo &RBI) { +static void 
buildReadLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI, + ReadLaneFnTy BuildReadLane) { LLT Ty = B.getMRI()->getType(VgprSrc); if (Ty.getSizeInBits() == 32) { -B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc}); +BuildReadLane(B, SgprDst, VgprSrc); return; } SmallVector SgprDstParts; - unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSr
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #142789)
llvmbot wrote: @llvm/pr-subscribers-llvm-globalisel Author: Petar Avramovic (petar-avramovic) Changes --- Patch is 22.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142789.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp (+118-18) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll (+2-23) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir (+19-59) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index ba661348ca5b5..b5fe0ed499255 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -23,6 +23,7 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -137,7 +138,123 @@ class AMDGPURegBankLegalizeCombiner { return {MatchMI, MatchMI->getOperand(1).getReg()}; } + std::tuple tryMatchRALFromUnmerge(Register Src) { +auto *ReadAnyLane = MRI.getVRegDef(Src); +if (ReadAnyLane->getOpcode() == AMDGPU::G_AMDGPU_READANYLANE) { + Register RALSrc = ReadAnyLane->getOperand(1).getReg(); + auto *UnMerge = getOpcodeDef(RALSrc, MRI); + if (UnMerge) +return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; +} +return {nullptr, -1}; + } + + Register getReadAnyLaneSrc(Register Src) { +// Src = G_AMDGPU_READANYLANE RALSrc +auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); +if (RAL) + return RALSrc; + +// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc +// LoSgpr = G_AMDGPU_READANYLANE LoVgpr +// HiSgpr = G_AMDGPU_READANYLANE HiVgpr +// Src G_MERGE_VALUES LoSgpr, HiSgpr +auto *Merge = getOpcodeDef(Src, MRI); +if (Merge) { + unsigned 
NumElts = Merge->getNumSources(); + auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); + if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) +return {}; + + // check if all elements are from same unmerge and there is no shuffling + for (unsigned i = 1; i < NumElts; ++i) { +auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); +if (UnmergeI != Unmerge || (unsigned)IdxI != i) + return {}; + } + return Unmerge->getSourceReg(); +} + +// ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge +// SgprI = G_AMDGPU_READANYLANE VgprI +// SgprLarge G_MERGE_VALUES ..., SgprI, ... +// ..., Src, ... = G_UNMERGE_VALUES SgprLarge +auto *UnMerge = getOpcodeDef(Src, MRI); +if (UnMerge) { + int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); + auto *Merge = getOpcodeDef(UnMerge->getSourceReg(), MRI); + if (Merge) { +auto [RAL, RALSrc] = +tryMatch(Merge->getSourceReg(Idx), AMDGPU::G_AMDGPU_READANYLANE); +if (RAL) + return RALSrc; + } +} + +return {}; + } + + bool tryEliminateReadAnyLane(MachineInstr &Copy) { +Register Dst = Copy.getOperand(0).getReg(); +Register Src = Copy.getOperand(1).getReg(); +if (!Src.isVirtual()) + return false; + +Register RALDst = Src; +MachineInstr &SrcMI = *MRI.getVRegDef(Src); +if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) { + RALDst = SrcMI.getOperand(1).getReg(); +} + +Register RALSrc = getReadAnyLaneSrc(RALDst); +if (!RALSrc) + return false; + +if (Dst.isVirtual()) { + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { +// Src = READANYLANE RALSrc +// Dst = Copy Src +// -> +// Dst = RALSrc +MRI.replaceRegWith(Dst, RALSrc); + } else { +// RALDst = READANYLANE RALSrc +// Src = G_BITCAST RALDst +// Dst = Copy Src +// -> +// NewVgpr = G_BITCAST RALDst +// Dst = NewVgpr +auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); +MRI.replaceRegWith(Dst, Bitcast.getReg(0)); + } +} else { + B.setInstr(Copy); + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { +// Src = READANYLANE RALSrc +// $Dst = Copy Src +// -> +// 
$Dst = Copy RALSrc +B.buildCopy(Dst, RALSrc); + } else { +// RALDst = READANYLANE RALSrc +// Src = G_BITCAST RALDst +// Dst = Copy Src +// -> +// NewVgpr = G_BITCAST RALDst +// $Dst = Copy NewVgpr +auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); +B.buildCopy(Dst, Bitcast.getReg(0)); + } +} + +eraseInstr(Copy, MRI, nullptr); +return true; + } + void tryCombineCopy(Machine
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #142789)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Petar Avramovic (petar-avramovic) Changes --- Patch is 22.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142789.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp (+118-18) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll (+2-23) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir (+19-59) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index ba661348ca5b5..b5fe0ed499255 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -23,6 +23,7 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -137,7 +138,123 @@ class AMDGPURegBankLegalizeCombiner { return {MatchMI, MatchMI->getOperand(1).getReg()}; } + std::tuple tryMatchRALFromUnmerge(Register Src) { +auto *ReadAnyLane = MRI.getVRegDef(Src); +if (ReadAnyLane->getOpcode() == AMDGPU::G_AMDGPU_READANYLANE) { + Register RALSrc = ReadAnyLane->getOperand(1).getReg(); + auto *UnMerge = getOpcodeDef(RALSrc, MRI); + if (UnMerge) +return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; +} +return {nullptr, -1}; + } + + Register getReadAnyLaneSrc(Register Src) { +// Src = G_AMDGPU_READANYLANE RALSrc +auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); +if (RAL) + return RALSrc; + +// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc +// LoSgpr = G_AMDGPU_READANYLANE LoVgpr +// HiSgpr = G_AMDGPU_READANYLANE HiVgpr +// Src G_MERGE_VALUES LoSgpr, HiSgpr +auto *Merge = getOpcodeDef(Src, MRI); +if (Merge) { + unsigned 
NumElts = Merge->getNumSources(); + auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); + if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) +return {}; + + // check if all elements are from same unmerge and there is no shuffling + for (unsigned i = 1; i < NumElts; ++i) { +auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); +if (UnmergeI != Unmerge || (unsigned)IdxI != i) + return {}; + } + return Unmerge->getSourceReg(); +} + +// ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge +// SgprI = G_AMDGPU_READANYLANE VgprI +// SgprLarge G_MERGE_VALUES ..., SgprI, ... +// ..., Src, ... = G_UNMERGE_VALUES SgprLarge +auto *UnMerge = getOpcodeDef(Src, MRI); +if (UnMerge) { + int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); + auto *Merge = getOpcodeDef(UnMerge->getSourceReg(), MRI); + if (Merge) { +auto [RAL, RALSrc] = +tryMatch(Merge->getSourceReg(Idx), AMDGPU::G_AMDGPU_READANYLANE); +if (RAL) + return RALSrc; + } +} + +return {}; + } + + bool tryEliminateReadAnyLane(MachineInstr &Copy) { +Register Dst = Copy.getOperand(0).getReg(); +Register Src = Copy.getOperand(1).getReg(); +if (!Src.isVirtual()) + return false; + +Register RALDst = Src; +MachineInstr &SrcMI = *MRI.getVRegDef(Src); +if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) { + RALDst = SrcMI.getOperand(1).getReg(); +} + +Register RALSrc = getReadAnyLaneSrc(RALDst); +if (!RALSrc) + return false; + +if (Dst.isVirtual()) { + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { +// Src = READANYLANE RALSrc +// Dst = Copy Src +// -> +// Dst = RALSrc +MRI.replaceRegWith(Dst, RALSrc); + } else { +// RALDst = READANYLANE RALSrc +// Src = G_BITCAST RALDst +// Dst = Copy Src +// -> +// NewVgpr = G_BITCAST RALDst +// Dst = NewVgpr +auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); +MRI.replaceRegWith(Dst, Bitcast.getReg(0)); + } +} else { + B.setInstr(Copy); + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { +// Src = READANYLANE RALSrc +// $Dst = Copy Src +// -> +// 
$Dst = Copy RALSrc +B.buildCopy(Dst, RALSrc); + } else { +// RALDst = READANYLANE RALSrc +// Src = G_BITCAST RALDst +// Dst = Copy Src +// -> +// NewVgpr = G_BITCAST RALDst +// $Dst = Copy NewVgpr +auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); +B.buildCopy(Dst, Bitcast.getReg(0)); + } +} + +eraseInstr(Copy, MRI, nullptr); +return true; + } + void tryCombineCopy(MachineI
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis (PR #142778)
https://github.com/shiltian approved this pull request. https://github.com/llvm/llvm-project/pull/142778 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
https://github.com/shiltian approved this pull request. https://github.com/llvm/llvm-project/pull/142777 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -2627,6 +2629,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) shiltian wrote: how about poison? https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #142790)
https://github.com/petar-avramovic ready_for_review https://github.com/llvm/llvm-project/pull/142790 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang][OpenMP] Extend locality spec to OMP clauses (`init` and `dealloc` regions) (PR #142795)
llvmbot wrote: @llvm/pr-subscribers-flang-fir-hlfir Author: Kareem Ergawy (ergawy) Changes Extends support for locality specifier to OpenMP translation by adding supprot for transling localizers that have `init` and `dealloc` regions. --- Full diff: https://github.com/llvm/llvm-project/pull/142795.diff 2 Files Affected: - (modified) flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp (+25-4) - (added) flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir (+51) ``diff diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 283c3052c166c..28f6c8bf02813 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -326,16 +326,37 @@ class DoConcurrentConversion TODO(localizer.getLoc(), "local_init conversion is not supported yet"); -if (!localizer.getInitRegion().empty()) - TODO(localizer.getLoc(), - "non-empty `init` regions are not supported yet"); - auto oldIP = rewriter.saveInsertionPoint(); rewriter.setInsertionPointAfter(localizer); auto privatizer = rewriter.create( localizer.getLoc(), sym.getLeafReference().str() + ".omp", localizer.getTypeAttr().getValue(), mlir::omp::DataSharingClauseType::Private); + +if (!localizer.getInitRegion().empty()) { + rewriter.cloneRegionBefore(localizer.getInitRegion(), + privatizer.getInitRegion(), + privatizer.getInitRegion().begin()); + auto firYield = mlir::cast( + privatizer.getInitRegion().back().getTerminator()); + rewriter.setInsertionPoint(firYield); + rewriter.create(firYield.getLoc(), + firYield.getOperands()); + rewriter.eraseOp(firYield); +} + +if (!localizer.getDeallocRegion().empty()) { + rewriter.cloneRegionBefore(localizer.getDeallocRegion(), + privatizer.getDeallocRegion(), + privatizer.getDeallocRegion().begin()); + auto firYield = mlir::cast( + privatizer.getDeallocRegion().back().getTerminator()); + rewriter.setInsertionPoint(firYield); + 
rewriter.create(firYield.getLoc(), + firYield.getOperands()); + rewriter.eraseOp(firYield); +} + rewriter.restoreInsertionPoint(oldIP); wsloopClauseOps.privateVars.push_back(op); diff --git a/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir b/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir new file mode 100644 index 0..a82d8d1715f56 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir @@ -0,0 +1,51 @@ +// Tests mapping `local` locality specifier to `private` clauses for non-empty +// `init` and `dealloc` regions. + +// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s + +func.func @my_allocator() { + return +} + +func.func @my_deallocator() { + return +} + +fir.local {type = local} @_QFlocal_assocEaa_private_box_10xf32 : !fir.box> init { +^bb0(%arg0: !fir.ref>>, %arg1: !fir.ref>>): + fir.call @my_allocator() : () -> () + fir.yield(%arg1 : !fir.ref>>) +} dealloc { +^bb0(%arg0: !fir.ref>>): + fir.call @my_deallocator() : () -> () + fir.yield +} + +func.func @_QPlocal_assoc() { + %0 = fir.alloca !fir.box> + %c1 = arith.constant 1 : index + + fir.do_concurrent { +%9 = fir.alloca i32 {bindc_name = "i"} +%10:2 = hlfir.declare %9 {uniq_name = "_QFlocal_assocEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) local(@_QFlocal_assocEaa_private_box_10xf32 %0 -> %arg1 : !fir.ref>>) { + %11 = fir.convert %arg0 : (index) -> i32 + fir.store %11 to %10#0 : !fir.ref +} + } + + return +} + +// CHECK: omp.private {type = private} @[[PRIVATIZER:.*]] : !fir.box> init { +// CHECK-NEXT: ^bb0(%{{.*}}: !{{.*}}, %{{.*}}: !{{.*}}): +// CHECK-NEXT: fir.call @my_allocator() : () -> () +// CHECK-NEXT: omp.yield(%{{.*}}) +// CHECK-NEXT: } dealloc { +// CHECK-NEXT: ^bb0(%{{.*}}: !{{.*}}): +// CHECK-NEXT: fir.call @my_deallocator() : () -> () +// CHECK-NEXT: omp.yield +// CHECK-NEXT: } + +// CHECK: %[[LOCAL_ALLOC:.*]] = fir.alloca 
!fir.box> +// CHECK: omp.wsloop private(@[[PRIVATIZER]] %[[LOCAL_ALLOC]] -> %{{.*}} : !{{.*}}) `` https://github.com/llvm/llvm-project/pull/142795 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -14935,6 +14936,52 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performPtrAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N1.getOpcode() == ISD::ADD) { +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, +//y is not, and (add y, z) is used only once. +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, +//z is not, and (add y, z) is used only once. +// The goal is to move constant offsets to the outermost ptradd, to create +// more opportunities to fold offsets into memory instructions. +// Together with the generic combines in DAGCombiner.cpp, this also +// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). +// +// This transform is here instead of in the general DAGCombiner as it can +// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for +// AArch64's CPA. shiltian wrote: remove the aarch64 part? https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -2627,6 +2629,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. shiltian wrote: for my own education, why is that? https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -14935,6 +14936,52 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performPtrAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N1.getOpcode() == ISD::ADD) { shiltian wrote: bail out early? https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang][OpenMP] Extend locality spec to OMP clauses (`init` and `dealloc` regions) (PR #142795)
https://github.com/ergawy created https://github.com/llvm/llvm-project/pull/142795 Extends support for locality specifier to OpenMP translation by adding support for translating localizers that have `init` and `dealloc` regions. >From 7b331963134cafc47290a45a321e7a62ef8cfce7 Mon Sep 17 00:00:00 2001 From: ergawy Date: Wed, 4 Jun 2025 08:51:46 -0500 Subject: [PATCH] [flang][OpenMP] Extend locality spec to OMP clauses (`init` and `dealloc` regions) Extends support for locality specifier to OpenMP translation by adding support for translating localizers that have `init` and `dealloc` regions. --- .../OpenMP/DoConcurrentConversion.cpp | 29 +-- .../locality_specifiers_init_dealloc.mlir | 51 +++ 2 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 283c3052c166c..28f6c8bf02813 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -326,16 +326,37 @@ class DoConcurrentConversion TODO(localizer.getLoc(), "local_init conversion is not supported yet"); -if (!localizer.getInitRegion().empty()) - TODO(localizer.getLoc(), - "non-empty `init` regions are not supported yet"); - auto oldIP = rewriter.saveInsertionPoint(); rewriter.setInsertionPointAfter(localizer); auto privatizer = rewriter.create( localizer.getLoc(), sym.getLeafReference().str() + ".omp", localizer.getTypeAttr().getValue(), mlir::omp::DataSharingClauseType::Private); + +if (!localizer.getInitRegion().empty()) { + rewriter.cloneRegionBefore(localizer.getInitRegion(), + privatizer.getInitRegion(), + privatizer.getInitRegion().begin()); + auto firYield = mlir::cast( + privatizer.getInitRegion().back().getTerminator()); + rewriter.setInsertionPoint(firYield); + rewriter.create(firYield.getLoc(), + firYield.getOperands()); + 
rewriter.eraseOp(firYield); +} + +if (!localizer.getDeallocRegion().empty()) { + rewriter.cloneRegionBefore(localizer.getDeallocRegion(), + privatizer.getDeallocRegion(), + privatizer.getDeallocRegion().begin()); + auto firYield = mlir::cast( + privatizer.getDeallocRegion().back().getTerminator()); + rewriter.setInsertionPoint(firYield); + rewriter.create(firYield.getLoc(), + firYield.getOperands()); + rewriter.eraseOp(firYield); +} + rewriter.restoreInsertionPoint(oldIP); wsloopClauseOps.privateVars.push_back(op); diff --git a/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir b/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir new file mode 100644 index 0..a82d8d1715f56 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir @@ -0,0 +1,51 @@ +// Tests mapping `local` locality specifier to `private` clauses for non-empty +// `init` and `dealloc` regions. + +// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s + +func.func @my_allocator() { + return +} + +func.func @my_deallocator() { + return +} + +fir.local {type = local} @_QFlocal_assocEaa_private_box_10xf32 : !fir.box> init { +^bb0(%arg0: !fir.ref>>, %arg1: !fir.ref>>): + fir.call @my_allocator() : () -> () + fir.yield(%arg1 : !fir.ref>>) +} dealloc { +^bb0(%arg0: !fir.ref>>): + fir.call @my_deallocator() : () -> () + fir.yield +} + +func.func @_QPlocal_assoc() { + %0 = fir.alloca !fir.box> + %c1 = arith.constant 1 : index + + fir.do_concurrent { +%9 = fir.alloca i32 {bindc_name = "i"} +%10:2 = hlfir.declare %9 {uniq_name = "_QFlocal_assocEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) local(@_QFlocal_assocEaa_private_box_10xf32 %0 -> %arg1 : !fir.ref>>) { + %11 = fir.convert %arg0 : (index) -> i32 + fir.store %11 to %10#0 : !fir.ref +} + } + + return +} + +// CHECK: omp.private {type = private} @[[PRIVATIZER:.*]] : !fir.box> init { +// CHECK-NEXT: 
^bb0(%{{.*}}: !{{.*}}, %{{.*}}: !{{.*}}): +// CHECK-NEXT: fir.call @my_allocator() : () -> () +// CHECK-NEXT: omp.yield(%{{.*}}) +// CHECK-NEXT: } dealloc { +// CHECK-NEXT: ^bb0(%{{.*}}: !{{.*}}): +// CHECK-NEXT: fir.call @my_deallocator() : () -> () +// CHECK-NEXT: omp.yield +// CHECK-NEXT: } + +// CHECK: %[[LOCAL_ALLOC:.*]] = fir.alloca !fir.box> +// CHECK: omp
[llvm-branch-commits] [llvm] [AMDGPU] New RegBankSelect: Add rules for `G_PTRTOINT` and `G_INTTOPTR` (PR #142604)
https://github.com/petar-avramovic approved this pull request. https://github.com/llvm/llvm-project/pull/142604 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -2627,6 +2629,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && shiltian wrote: I think you can early bail out here to prevent the giant code block from an indentation. https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #142790)
https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/142790 Add rules for G_AMDGPU_BUFFER_LOAD and implement waterfall lowering for divergent operands that must be sgpr. >From 6dd26d44b55420f91a1684e78938ea8b426680cc Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 4 Jun 2025 17:12:41 +0200 Subject: [PATCH] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize Add rules for G_AMDGPU_BUFFER_LOAD and implement waterfall lowering for divergent operands that must be sgpr. --- .../Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 54 +++- .../lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 2 + .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 240 +- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 28 +- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 2 + .../AMDGPU/GlobalISel/buffer-schedule.ll | 2 +- .../llvm.amdgcn.make.buffer.rsrc.ll | 2 +- .../regbankselect-amdgcn.raw.buffer.load.ll | 59 ++--- ...egbankselect-amdgcn.raw.ptr.buffer.load.ll | 59 ++--- ...regbankselect-amdgcn.struct.buffer.load.ll | 59 ++--- ...ankselect-amdgcn.struct.ptr.buffer.load.ll | 59 ++--- .../llvm.amdgcn.buffer.load-last-use.ll | 2 +- .../llvm.amdgcn.raw.atomic.buffer.load.ll | 42 +-- .../llvm.amdgcn.raw.ptr.atomic.buffer.load.ll | 42 +-- .../llvm.amdgcn.struct.atomic.buffer.load.ll | 48 ++-- ...vm.amdgcn.struct.ptr.atomic.buffer.load.ll | 48 ++-- .../CodeGen/AMDGPU/swizzle.bit.extract.ll | 4 +- 17 files changed, 512 insertions(+), 240 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 00979f44f9d34..b3edb959e14c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -117,45 +117,73 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) { return LLT::scalar(32); } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI); +typedef std::function +ReadLaneFnTy; + +static Register 
buildReadLane(MachineIRBuilder &, Register, + const RegisterBankInfo &, ReadLaneFnTy); static void unmergeReadAnyLane(MachineIRBuilder &B, SmallVectorImpl &SgprDstParts, LLT UnmergeTy, Register VgprSrc, - const RegisterBankInfo &RBI) { + const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID); auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc); for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { -SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI)); +SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL)); } } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI) { +static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc, + const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { LLT Ty = B.getMRI()->getType(VgprSrc); const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID); if (Ty.getSizeInBits() == 32) { -return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc}) -.getReg(0); +Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty}); +return BuildRL(B, SgprDst, VgprSrc).getReg(0); } SmallVector SgprDstParts; - unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI, + BuildRL); return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0); } -void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, - Register VgprSrc, const RegisterBankInfo &RBI) { +static void buildReadLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI, + ReadLaneFnTy BuildReadLane) { LLT Ty = B.getMRI()->getType(VgprSrc); if (Ty.getSizeInBits() == 32) { -B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc}); +BuildReadLane(B, SgprDst, VgprSrc); return; } SmallVector SgprDstParts; - unmergeReadAnyLane(B, 
SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI, + BuildReadLane); B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0); } + +void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI) { + return bu
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -14935,6 +14936,52 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performPtrAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N1.getOpcode() == ISD::ADD) { +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, +//y is not, and (add y, z) is used only once. +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, +//z is not, and (add y, z) is used only once. +// The goal is to move constant offsets to the outermost ptradd, to create +// more opportunities to fold offsets into memory instructions. +// Together with the generic combines in DAGCombiner.cpp, this also +// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). +// +// This transform is here instead of in the general DAGCombiner as it can +// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for +// AArch64's CPA. arichardson wrote: I think having it here is fine, it explains why it can't be moved to a generic fold. https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [RISCV] Integrate RISCV target in baremetal toolchain object and deprecate RISCVToolchain object (PR #121831)
quic-garvgupt wrote: Hi @petrhosek, I understand you had some feedback regarding further splitting this PR; however, I responded to all of it. If you do not have any further comments/feedback, I would really appreciate it if you could approve this PR. Thanks again for reviewing all the patches! https://github.com/llvm/llvm-project/pull/121831 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -2627,6 +2629,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) jrtc27 wrote: Only if they're the same type. This isn't valid for CHERI, the LHS is a capability, the RHS is an integer. Nor is this valid for architectures where address size != index size. https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
jrtc27 wrote: > isNullConstant(X), since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, I don't know if it's important for CHERI to have this or if the IR-level optimisations render it not so needed. But `NULL + int` is how we represent an integer as a pointer, so `NULL + x + y` is something that can legitimately turn up, and we want to be able to fold the x and y together as just integer arithmetic, only converting to a capability at the very end when needed. https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen] Limit number of analyzed predecessors (PR #142584)
kyulee-com wrote: Adding this threshold check within `isTrellis()` feels somewhat unnatural. If compile time is a concern, could we simply check the size of functions (in terms of the number of blocks, as opposed to predecessors only) early in this pass and either skip it or switch to a faster, simpler algorithm? Also, a size of 1000 seems small — maybe 1? https://github.com/llvm/llvm-project/pull/142584 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -2627,6 +2629,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) arichardson wrote: yeah this needs a ```suggestion if (isNullConstant(N0) && PtrVT == IntVT) ``` https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
@@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s + +; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG +; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable +; similar transformations in that pass. + +; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use. +define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_ZTwoUses: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_ZTwoUses: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24 + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset + %l = load i64, ptr addrspace(1) %gep1, align 8 + %r = add i64 %l, %voffset + ret i64 %r +} + +define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, 
i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %add0 = add nuw nsw i64 %voffset, 24 + %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 + %l = load i64, ptr addrspace(1) %gep0, align 8 + ret i64 %l +} + +; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These +; would be folded away in most cases, but the index computation introduced by +; the legalization of wide vector stores can for example introduce them. 
+define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { +; GFX942_PTRADD-LABEL: store_v16i32: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23 +; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s17 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s18 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s19 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s12 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s13 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s14 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s15 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s8 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s9 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s10 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s11 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942_PTRAD