[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -14944,6 +14945,51 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performPtrAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N1.getOpcode() == ISD::ADD) { +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, +//y is not, and (add y, z) is used only once. +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, +//z is not, and (add y, z) is used only once. +// The goal is to move constant offsets to the outermost ptradd, to create +// more opportunities to fold offsets into memory instructions. +// Together with the generic combines in DAGCombiner.cpp, this also +// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). +// +// This transform is here instead of in the general DAGCombiner as it can +// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for +// AArch64's CPA. +SDValue X = N0; +SDValue Y = N1.getOperand(0); +SDValue Z = N1.getOperand(1); +bool N1OneUse = N1.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); +if ((ZIsConstant != YIsConstant) && N1OneUse) { ritter-x2a wrote: Done. https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Tests for target-specific ISD::PTRADD combines (PR #143672)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143672 >From ac6d5eb285b1f56b5c32133279224feb2b8bd8a9 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 11 Jun 2025 05:14:34 -0400 Subject: [PATCH] [AMDGPU][SDAG] Tests for target-specific ISD::PTRADD combines Pre-committing tests to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 176 ++ 1 file changed, 176 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 2e76033a480f4..1ec94162951a6 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -142,3 +142,179 @@ entry: tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) ret void } + +; Test skipping the lower-32-bit addition if it is unnecessary. +define ptr @huge_offset_low_32_unused(ptr %p) { +; GFX942_PTRADD-LABEL: huge_offset_low_32_unused: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_mov_b32 s0, 0 +; GFX942_PTRADD-NEXT:s_mov_b32 s1, 1 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: huge_offset_low_32_unused: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_add_u32_e32 v1, 1, v1 +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep = getelementptr inbounds i8, ptr %p, i64 u0x1 + ret ptr %gep +} + +; Reassociate address computation if it leads to more scalar operations. 
+define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { +; GFX942_PTRADD-LABEL: reassoc_scalar_r: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_PTRADD-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7] +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_PTRADD-NEXT:s_endpgm +; +; GFX942_LEGACY-LABEL: reassoc_scalar_r: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_LEGACY-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_LEGACY-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_LEGACY-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_LEGACY-NEXT:s_add_u32 s2, s2, s6 +; GFX942_LEGACY-NEXT:s_addc_u32 s3, s3, s7 +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_LEGACY-NEXT:s_endpgm +entry: + %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() + %voffset = zext i32 %voffset32 to i64 + %offset = add nuw nsw i64 %voffset, %soffset + %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset + store ptr addrspace(1) %gep, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { +; GFX942_PTRADD-LABEL: reassoc_scalar_l: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_PTRADD-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1] 
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_PTRADD-NEXT:s_endpgm +; +; GFX942_LEGACY-LABEL: reassoc_scalar_l: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_LEGACY-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_LEGACY-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_LEGACY-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_LEGACY-NEXT:s_add_u32 s2, s2, s6 +; GFX942_LEGACY-NEXT:s_addc_u32 s3, s3, s7 +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_LEGACY-NEXT:s_endpgm +entry: + %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() + %voffset = zext i32 %voffset32 to i64 + %offset = add nuw nsw i64 %soffset, %voffset + %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset + store ptr addrspace(1) %gep, ptr addrspace
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis (PR #142778)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142778 >From af2d3ea3a17b2d7eec54fcf030ff89a1a0422e5a Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 09:48:02 -0400 Subject: [PATCH] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis This is used in a bunch of memory-related transforms. For SWDEV-516125. --- .../SelectionDAGAddressAnalysis.cpp | 6 ++-- .../AMDGPU/ptradd-sdag-optimizations.ll | 28 ++- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index f2ab88851b780..da92aaa860b2b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -231,6 +231,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } break; case ISD::ADD: +case ISD::PTRADD: if (auto *C = dyn_cast(Base->getOperand(1))) { Offset += C->getSExtValue(); Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0)); @@ -259,7 +260,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, break; } - if (Base->getOpcode() == ISD::ADD) { + if (Base->isAnyAdd()) { // TODO: The following code appears to be needless as it just // bails on some Ptrs early, reducing the cases where we // find equivalence. We should be able to remove this. 
@@ -282,8 +283,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } // Check if Index Offset pattern -if (Index->getOpcode() != ISD::ADD || -!isa(Index->getOperand(1))) +if (!Index->isAnyAdd() || !isa(Index->getOperand(1))) return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt); Offset += cast(Index->getOperand(1))->getSExtValue(); diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index d3242905ada64..2e76033a480f4 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -130,26 +130,14 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; Taken from memcpy-param-combinations.ll, tests PTRADD handling in ; SelectionDAGAddressAnalysis. define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { -; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +; GFX942-LABEL: memcpy_p1_p4_sz16_align_1_1: +; 
GFX942: ; %bb.0: ; %entry +; GFX942-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) ret void ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [libc++][C++03] Remove XFAILs from the non-frozen libc++-specific tests (PR #144101)
https://github.com/philnik777 created https://github.com/llvm/llvm-project/pull/144101 The tests in `libcxx/test/libcxx` aren't run against the frozen headers anymore, so we can remove any XFAILs in them. This is part of https://discourse.llvm.org/t/rfc-freezing-c-03-headers-in-libc. >From e080572b8168260ecb4c8b2be39111d579056f74 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 13 Jun 2025 17:49:01 +0200 Subject: [PATCH] [libc++][C++03] Remove XFAILs from the non-frozen libc++-specific tests --- libcxx/test/libcxx/algorithms/half_positive.pass.cpp | 2 -- libcxx/test/libcxx/algorithms/vectorization.compile.pass.cpp | 2 -- .../assertions/customize_verbose_abort.link-time.pass.cpp | 2 -- libcxx/test/libcxx/assertions/default_verbose_abort.pass.cpp | 2 -- libcxx/test/libcxx/assertions/modes/none.pass.cpp | 2 -- libcxx/test/libcxx/assertions/single_expression.pass.cpp | 2 -- .../atomics.types.operations.req/atomic_fetch_add.verify.cpp | 2 -- .../atomic_fetch_add_explicit.verify.cpp | 2 -- .../atomics.types.operations.req/atomic_fetch_sub.verify.cpp | 2 -- .../atomic_fetch_sub_explicit.verify.cpp | 2 -- libcxx/test/libcxx/clang_modules_include.gen.py| 2 -- libcxx/test/libcxx/clang_tidy.gen.py | 3 --- .../containers/associative/tree_balance_after_insert.pass.cpp | 2 -- .../containers/associative/tree_key_value_traits.pass.cpp | 2 -- .../libcxx/containers/associative/tree_left_rotate.pass.cpp| 2 -- libcxx/test/libcxx/containers/associative/tree_remove.pass.cpp | 2 -- .../libcxx/containers/associative/tree_right_rotate.pass.cpp | 2 -- .../containers/associative/unord.map/abi.compile.pass.cpp | 2 -- .../containers/associative/unord.set/abi.compile.pass.cpp | 2 -- .../test/libcxx/containers/container_traits.compile.pass.cpp | 2 -- libcxx/test/libcxx/containers/unord/key_value_traits.pass.cpp | 2 -- libcxx/test/libcxx/containers/unord/next_pow2.pass.cpp | 2 -- libcxx/test/libcxx/containers/unord/next_prime.pass.cpp| 2 -- 
libcxx/test/libcxx/depr/depr.c.headers/extern_c.pass.cpp | 2 -- .../libcxx/experimental/fexperimental-library.compile.pass.cpp | 2 -- libcxx/test/libcxx/header_inclusions.gen.py| 1 - .../string.streams/stringbuf/const_sso_buffer.pass.cpp | 2 -- libcxx/test/libcxx/iterators/aliasing_iterator.pass.cpp| 2 -- libcxx/test/libcxx/iterators/bounded_iter/arithmetic.pass.cpp | 2 -- libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp | 2 -- .../test/libcxx/iterators/bounded_iter/pointer_traits.pass.cpp | 2 -- .../test/libcxx/iterators/bounded_iter/types.compile.pass.cpp | 2 -- .../iterators/contiguous_iterators.conv.compile.pass.cpp | 2 -- libcxx/test/libcxx/iterators/contiguous_iterators.verify.cpp | 2 -- .../iterator.primitives/iterator.operations/prev.verify.cpp| 2 -- .../language.support/support.dynamic/libcpp_deallocate.sh.cpp | 2 -- libcxx/test/libcxx/memory/allocation_guard.pass.cpp| 2 -- libcxx/test/libcxx/memory/swap_allocator.pass.cpp | 2 -- libcxx/test/libcxx/numerics/bit.ops.pass.cpp | 2 -- libcxx/test/libcxx/numerics/clamp_to_integral.pass.cpp | 2 -- .../libcxx/numerics/complex.number/cmplx.over.pow.pass.cpp | 2 -- libcxx/test/libcxx/selftest/test_macros.pass.cpp | 2 -- .../strings/basic.string/string.capacity/max_size.pass.cpp | 2 -- .../test/libcxx/strings/c.strings/constexpr_memmove.pass.cpp | 2 -- libcxx/test/libcxx/system_reserved_names.gen.py| 2 -- libcxx/test/libcxx/transitive_includes.gen.py | 2 -- libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp | 2 -- libcxx/test/libcxx/type_traits/desugars_to.compile.pass.cpp| 2 -- libcxx/test/libcxx/type_traits/is_constant_evaluated.pass.cpp | 2 -- libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp | 2 -- .../type_traits/is_trivially_comparable.compile.pass.cpp | 2 -- .../type_traits/is_trivially_relocatable.compile.pass.cpp | 2 -- libcxx/test/libcxx/utilities/exception_guard.odr.sh.cpp| 2 -- .../function.objects/refwrap/desugars_to.compile.pass.cpp | 2 -- 
libcxx/test/libcxx/utilities/is_pointer_in_range.pass.cpp | 2 -- libcxx/test/libcxx/utilities/is_valid_range.pass.cpp | 2 -- .../libcxx/utilities/meta/is_referenceable.compile.pass.cpp| 2 -- libcxx/test/libcxx/utilities/meta/meta_base.pass.cpp | 2 -- libcxx/test/libcxx/utilities/no_destroy.pass.cpp | 2 -- libcxx/test/libcxx/utilities/template.bitset/includes.pass.cpp | 2 -- .../utilities/utility/private_constructor_tag.compile.pass.cpp | 2 -- 61 files changed, 122 deletions(-) diff --git a/libcxx/test/libcxx/algorithms/half_positive.pass.cpp b/libcxx/test/libcxx/algorithms/half_positiv
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Test ISD::PTRADD handling in VOP3 patterns (PR #143880)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143880 >From 3f69917b67760c64fdafcb42b5783b8aaafb1406 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 12 Jun 2025 06:13:26 -0400 Subject: [PATCH] [AMDGPU][SDAG] Test ISD::PTRADD handling in VOP3 patterns Pre-committing tests to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 45 +++ 1 file changed, 45 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index c00bccdbce6b7..d48bfe0bb7f21 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -263,3 +263,48 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { store float 1.0, ptr addrspace(1) %p1 ret void } + +; Use non-zero shift amounts in v_lshl_add_u64. +define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: select_v_lshl_add_u64: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: select_v_lshl_add_u64: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep = getelementptr inbounds i64, ptr %base, i64 %voffset + ret ptr %gep +} + +; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the +; mul into a mul24. 
+define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) { +; GFX942_PTRADD-LABEL: fold_mul24_into_mad: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_and_b32_e32 v2, 0xf, v2 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v4, 0xf, v4 +; GFX942_PTRADD-NEXT:v_mul_hi_u32_u24_e32 v3, v2, v4 +; GFX942_PTRADD-NEXT:v_mul_u32_u24_e32 v2, v2, v4 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: fold_mul24_into_mad: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_and_b32_e32 v2, 0xf, v2 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v3, 0xf, v4 +; GFX942_LEGACY-NEXT:v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %a_masked = and i64 %a, u0xf + %b_masked = and i64 %b, u0xf + %mul = mul i64 %a_masked, %b_masked + %gep = getelementptr inbounds i8, ptr %base, i64 %mul + ret ptr %gep +} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/142422 >From b950757c234900db941ed950ea3469b520d2e28a Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 2 Jun 2025 15:13:13 + Subject: [PATCH 1/8] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` Previously, slices were sometimes marked as non-contiguous when they were actually contiguous. This occurred when the vector type had leading unit dimensions, e.g., `vector<1x1x...x1xd0xd1x...xdn-1xT>``. In such cases, only the trailing n dimensions of the memref need to be contiguous, not the entire vector rank. This affects how `FlattenContiguousRowMajorTransfer{Read,Write}Pattern` flattens `transfer_read` and `transfer_write`` ops. The pattern used to collapse a number of dimensions equal to the vector rank, which may be incorrect when leading dimensions are unit-sized. This patch fixes the issue by collapsing only as many trailing memref dimensions as are actually contiguous. --- .../mlir/Dialect/Vector/Utils/VectorUtils.h | 54 - .../Transforms/VectorTransferOpTransforms.cpp | 8 +- mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp | 25 ++-- .../Vector/vector-transfer-flatten.mlir | 108 +- 4 files changed, 120 insertions(+), 75 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h index 6609b28d77b6c..ed06d7a029494 100644 --- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h +++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h @@ -49,35 +49,37 @@ FailureOr> isTranspose2DSlice(vector::TransposeOp op); /// Return true if `vectorType` is a contiguous slice of `memrefType`. /// -/// Only the N = vectorType.getRank() trailing dims of `memrefType` are -/// checked (the other dims are not relevant). 
Note that for `vectorType` to be -/// a contiguous slice of `memrefType`, the trailing dims of the latter have -/// to be contiguous - this is checked by looking at the corresponding strides. +/// The leading unit dimensions of the vector type are ignored as they +/// are not relevant to the result. Let N be the number of the vector +/// dimensions after ignoring a leading sequence of unit ones. /// -/// There might be some restriction on the leading dim of `VectorType`: +/// For `vectorType` to be a contiguous slice of `memrefType` +/// a) the N trailing dimensions of the latter must be contiguous, and +/// b) the trailing N dimensions of `vectorType` and `memrefType`, +/// except the first of them, must match. /// -/// Case 1. If all the trailing dims of `vectorType` match the trailing dims -/// of `memrefType` then the leading dim of `vectorType` can be -/// arbitrary. -/// -///Ex. 1.1 contiguous slice, perfect match -/// vector<4x3x2xi32> from memref<5x4x3x2xi32> -///Ex. 1.2 contiguous slice, the leading dim does not match (2 != 4) -/// vector<2x3x2xi32> from memref<5x4x3x2xi32> -/// -/// Case 2. If an "internal" dim of `vectorType` does not match the -/// corresponding trailing dim in `memrefType` then the remaining -/// leading dims of `vectorType` have to be 1 (the first non-matching -/// dim can be arbitrary). +/// Examples: /// -///Ex. 2.1 non-contiguous slice, 2 != 3 and the leading dim != <1> -/// vector<2x2x2xi32> from memref<5x4x3x2xi32> -///Ex. 2.2 contiguous slice, 2 != 3 and the leading dim == <1> -/// vector<1x2x2xi32> from memref<5x4x3x2xi32> -///Ex. 2.3. contiguous slice, 2 != 3 and the leading dims == <1x1> -/// vector<1x1x2x2xi32> from memref<5x4x3x2xi32> -///Ex. 2.4. 
non-contiguous slice, 2 != 3 and the leading dims != <1x1> -/// vector<2x1x2x2xi32> from memref<5x4x3x2xi32>) +/// Ex.1 contiguous slice, perfect match +/// vector<4x3x2xi32> from memref<5x4x3x2xi32> +/// Ex.2 contiguous slice, the leading dim does not match (2 != 4) +/// vector<2x3x2xi32> from memref<5x4x3x2xi32> +/// Ex.3 non-contiguous slice, 2 != 3 +/// vector<2x2x2xi32> from memref<5x4x3x2xi32> +/// Ex.4 contiguous slice, leading unit dimension of the vector ignored, +///2 != 3 (allowed) +/// vector<1x2x2xi32> from memref<5x4x3x2xi32> +/// Ex.5. contiguous slice, leading two unit dims of the vector ignored, +/// 2 != 3 (allowed) +/// vector<1x1x2x2xi32> from memref<5x4x3x2xi32> +/// Ex.6. non-contiguous slice, 2 != 3, no leading sequence of unit dims +/// vector<2x1x2x2xi32> from memref<5x4x3x2xi32>) +/// Ex.7 contiguous slice, memref needs to be contiguous only on the last +///dimension +/// vector<1x1x2xi32> from memref<2x2x2xi32, strided<[8, 4, 1]>> +/// Ex.8 non-contiguous slice, memref needs to be contiguous on the last +///two dimensions, and it isn't +/// vector<1x2x2xi32> from memref<2x2x2xi32, strided<[8, 4, 1]>> bool isContiguo
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines (PR #143673)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143673 >From 10494be4478143e69a6116653228170195c00dc2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 11 Jun 2025 05:48:45 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine. For SWDEV-516125. --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 139 .../AMDGPU/ptradd-sdag-optimizations.ll | 151 ++ 3 files changed, 160 insertions(+), 134 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 45a37622a531b..1210777428020 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6706,7 +6706,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, return SDValue(); int64_t Offset = C2->getSExtValue(); switch (Opcode) { - case ISD::ADD: break; + case ISD::ADD: + case ISD::PTRADD: +break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f645b09db734b..bd123fc4ffd1b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -14329,7 +14331,7 @@ static SDValue 
tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14362,7 +14364,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14474,8 +14476,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); +unsigned Opcode = N->getOpcode(); +if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = -DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); +DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -14949,42 +14954,116 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, -//y is not, and (add y, z) is used only once. -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, -//z is not, and (add y, z) is used only once. -// The goal is to move constant offsets to the outermost ptradd, to create -// more opportunities to fold offsets into memory instructions. 
-// Together with the generic combines in DAGCombiner.cpp, this also -// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). -// -// This transform is here instead of in the general DAGCombiner as it can -// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for -// AArch64's CPA. -SDValue X = N0; -SDValue Y = N1.getOperand(0); -SDValue Z = N1.getOperand(1); -if (N1.hasOneUse()) { - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); - if (ZIsConstant != YIsConstant) { -// If both additions in the original were NUW, the new ones are as well. -
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142739 >From 6ea714e83e4714d9fe025e5e9fee48b41f223cb8 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 03:32:32 -0400 Subject: [PATCH 1/6] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 92 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + .../AMDGPU/ptradd-sdag-optimizations.ll | 194 ++ 4 files changed, 201 insertions(+), 135 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5d62ded171f4f..505cb264ae948 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -419,6 +419,7 @@ namespace { SDValue visitADDLike(SDNode *N); SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); +SDValue visitPTRADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDSAT(SDNode *N); SDValue visitSUBSAT(SDNode *N); @@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return true; } - if (Opc != ISD::ADD) + if (Opc != ISD::ADD && Opc != ISD::PTRADD) return false; auto *C2 = dyn_cast(N1); @@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::TokenFactor:return 
visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD:return visitADD(N); + case ISD::PTRADD: return visitPTRADD(N); case ISD::SUB:return visitSUB(N); case ISD::SADDSAT: case ISD::UADDSAT:return visitADDSAT(N); @@ -2628,6 +2630,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. +if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. 
+ if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; + SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); + AddToWorklist(Add.getNode()); + return DAG.getMemBasePlusOffset(X, Add, DL, Flags); +} + +// TODO: There is another possible fold here that was proven useful. +// It would be this: +// +// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if: +// * (ptradd x, y) has one use; and +// * y is a constant; and +// * z is not a constant. +// +// In some cases, specifically in AArch64's FEAT_CPA, it exposes the +// opportunity to select more complex instructions such as SUBPT and +// MSUBPT. H
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142777 >From e8eccce3f9221dd52f15341873b03f220ef84739 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 09:30:34 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis Pre-committing test to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 28 +++ 1 file changed, 28 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index b78dea1684545..d3242905ada64 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -126,3 +126,31 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { store volatile i64 %dispatch.id, ptr addrspace(1) %ptr ret void } + +; Taken from memcpy-param-combinations.ll, tests PTRADD handling in +; SelectionDAGAddressAnalysis. 
+define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { +; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) + ret void +} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [IR2Vec] Scale embeddings once in vocab analysis instead of repetitive scaling (PR #143986)
@@ -259,32 +306,40 @@ Error IR2VecVocabAnalysis::readVocabulary() { return createFileError(VocabFile, BufOrError.getError()); auto Content = BufOrError.get()->getBuffer(); - json::Path::Root Path(""); + Expected ParsedVocabValue = json::parse(Content); if (!ParsedVocabValue) return ParsedVocabValue.takeError(); - bool Res = json::fromJSON(*ParsedVocabValue, Vocabulary, Path); - if (!Res) -return createStringError(errc::illegal_byte_sequence, - "Unable to parse the vocabulary"); + ir2vec::Vocab OpcodeVocab, TypeVocab, ArgVocab; + unsigned OpcodeDim, TypeDim, ArgDim; mtrofin wrote: Initialize at declaration https://github.com/llvm/llvm-project/pull/143986 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [IR2Vec] Scale embeddings once in vocab analysis instead of repetitive scaling (PR #143986)
@@ -259,32 +306,40 @@ Error IR2VecVocabAnalysis::readVocabulary() { return createFileError(VocabFile, BufOrError.getError()); auto Content = BufOrError.get()->getBuffer(); - json::Path::Root Path(""); + Expected ParsedVocabValue = json::parse(Content); if (!ParsedVocabValue) return ParsedVocabValue.takeError(); - bool Res = json::fromJSON(*ParsedVocabValue, Vocabulary, Path); - if (!Res) -return createStringError(errc::illegal_byte_sequence, - "Unable to parse the vocabulary"); + ir2vec::Vocab OpcodeVocab, TypeVocab, ArgVocab; + unsigned OpcodeDim, TypeDim, ArgDim; + if (auto Err = parseVocabSection("Opcodes", *ParsedVocabValue, OpcodeVocab, mtrofin wrote: This changes the format, best to also update the doc. Also, this means the sections must all be present, even if empty, correct? SGTM, just something worth spelling out. https://github.com/llvm/llvm-project/pull/143986 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [IR2Vec] Scale embeddings once in vocab analysis instead of repetitive scaling (PR #143986)
https://github.com/mtrofin edited https://github.com/llvm/llvm-project/pull/143986 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen] Limit number of analyzed predecessors (PR #142584)
aengelke wrote: Reused an existing test case, this also shows the difference in the resulting block order. If preferred, I can also write a separate test case. https://github.com/llvm/llvm-project/pull/142584 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
https://github.com/momchil-velikov edited https://github.com/llvm/llvm-project/pull/142422 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] b81d5e0 - [InstCombine] Fold shuffles through all trivially vectorizable intrinsics (#141979)
Author: Luke Lau Date: 2025-06-13T18:25:07+01:00 New Revision: b81d5e06c7cba8c9f1f5380daed4b9ee139214ba URL: https://github.com/llvm/llvm-project/commit/b81d5e06c7cba8c9f1f5380daed4b9ee139214ba DIFF: https://github.com/llvm/llvm-project/commit/b81d5e06c7cba8c9f1f5380daed4b9ee139214ba.diff LOG: [InstCombine] Fold shuffles through all trivially vectorizable intrinsics (#141979) This addresses a TODO in foldShuffledIntrinsicOperands to use isTriviallyVectorizable instead of a hardcoded list of intrinsics, which in turn allows more intriniscs to be scalarized by VectorCombine. >From what I can tell every intrinsic here should be speculatable so an assertion was added. Because this enables intrinsics like abs which have a scalar operand, we need to also check isVectorIntrinsicWithScalarOpAtArg. Added: Modified: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp llvm/test/Transforms/InstCombine/abs-1.ll llvm/test/Transforms/InstCombine/fma.ll llvm/test/Transforms/InstCombine/minmax-intrinsics.ll llvm/test/Transforms/InstCombine/powi.ll llvm/test/Transforms/InstCombine/scmp.ll llvm/test/Transforms/InstCombine/sqrt.ll llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll Removed: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index c169ab25b2106..8c8cc0859e4af 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1400,42 +1400,46 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) { /// try to shuffle after the intrinsic. Instruction * InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) { - // TODO: This should be extended to handle other intrinsics like fshl, ctpop, - // etc. 
Use llvm::isTriviallyVectorizable() and related to determine - // which intrinsics are safe to shuffle? - switch (II->getIntrinsicID()) { - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::fma: - case Intrinsic::fshl: - case Intrinsic::fshr: -break; - default: + if (!isTriviallyVectorizable(II->getIntrinsicID()) || + !II->getCalledFunction()->isSpeculatable()) +return nullptr; + + // fabs is canonicalized to fabs (shuffle ...) in foldShuffleOfUnaryOps, so + // avoid undoing it. + if (match(II, m_FAbs(m_Value( return nullptr; - } Value *X; Constant *C; ArrayRef Mask; - auto *NonConstArg = find_if_not(II->args(), IsaPred); + auto *NonConstArg = find_if_not(II->args(), [&II](Use &Arg) { +return isa(Arg.get()) || + isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), + Arg.getOperandNo(), nullptr); + }); if (!NonConstArg || !match(NonConstArg, m_Shuffle(m_Value(X), m_Poison(), m_Mask(Mask return nullptr; - // At least 1 operand must have 1 use because we are creating 2 instructions. - if (none_of(II->args(), [](Value *V) { return V->hasOneUse(); })) + // At least 1 operand must be a shuffle with 1 use because we are creating 2 + // instructions. + if (none_of(II->args(), [](Value *V) { +return isa(V) && V->hasOneUse(); + })) return nullptr; // See if all arguments are shuffled with the same mask. 
SmallVector NewArgs; Type *SrcTy = X->getType(); - for (Value *Arg : II->args()) { -if (match(Arg, m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) && -X->getType() == SrcTy) + for (Use &Arg : II->args()) { +if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), + Arg.getOperandNo(), nullptr)) + NewArgs.push_back(Arg); +else if (match(&Arg, + m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) && + X->getType() == SrcTy) NewArgs.push_back(X); -else if (match(Arg, m_ImmConstant(C))) { +else if (match(&Arg, m_ImmConstant(C))) { // If it's a constant, try find the constant that would be shuffled to C. if (Constant *ShuffledC = unshuffleConstant(Mask, C, cast(SrcTy))) @@ -1448,8 +1452,12 @@ InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) { // intrinsic (shuf X, M), (shuf Y, M), ... --> shuf (intrinsic X, Y, ...), M Instruction *FPI = isa(II) ? II : nullptr; + // Result type might be a diff erent vector width. + // TODO: Check that the result type isn't widened? + VectorType *ResTy = + VectorType::get(II->getType()->getScalarType(), cast(SrcTy)); Value *NewIntrinsic = - Builder.CreateIntr
[llvm-branch-commits] [libc] fd43215 - [libc] Fix bugs found when testing with all headers (#144049)
Author: William Huynh Date: 2025-06-13T10:26:40-07:00 New Revision: fd432151a607a997c417f32cb70650fc7728629a URL: https://github.com/llvm/llvm-project/commit/fd432151a607a997c417f32cb70650fc7728629a DIFF: https://github.com/llvm/llvm-project/commit/fd432151a607a997c417f32cb70650fc7728629a.diff LOG: [libc] Fix bugs found when testing with all headers (#144049) Fixes a couple of bugs found when building. The PR to enable the headers can be found here: #144114. - math.yaml: float128 guard - wchar.yaml: __restrict keyword order Added: Modified: libc/include/math.yaml libc/include/wchar.yaml libc/test/src/stdio/printf_core/converter_test.cpp Removed: diff --git a/libc/include/math.yaml b/libc/include/math.yaml index 466c08ade6fc4..11bead0745954 100644 --- a/libc/include/math.yaml +++ b/libc/include/math.yaml @@ -734,7 +734,7 @@ functions: - type: float128 - type: float128 - type: float128 -guards: LIBC_TYPES_HAS_FLOAT128 +guard: LIBC_TYPES_HAS_FLOAT128 - name: ffmal standards: - stdc diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 1af15a6c112b5..84db73d8f01ea 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -109,8 +109,8 @@ functions: - stdc return_type: wchar_t * arguments: - - type: __restrict wchar_t * - - type: const __restrict wchar_t * + - type: wchar_t *__restrict + - type: const wchar_t *__restrict - type: size_t - name: wmemmove standards: @@ -125,16 +125,16 @@ functions: - stdc return_type: wchar_t * arguments: - - type: __restrict wchar_t * - - type: const __restrict wchar_t * + - type: wchar_t *__restrict + - type: const wchar_t *__restrict - type: size_t - name: wcscat standards: - stdc return_type: wchar_t * arguments: - - type: __restrict wchar_t * - - type: const __restrict wchar_t * + - type: wchar_t *__restrict + - type: const wchar_t *__restrict - name: wcsstr standards: - stdc @@ -147,13 +147,13 @@ functions: - stdc return_type: wchar_t * arguments: - - type: __restrict wchar_t * - - type: const __restrict 
wchar_t * + - type: wchar_t *__restrict + - type: const wchar_t *__restrict - type: size_t - name: wcscpy standards: - stdc return_type: wchar_t * arguments: - - type: __restrict wchar_t * - - type: const __restrict wchar_t * + - type: wchar_t *__restrict + - type: const wchar_t *__restrict diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index 96a00ae598ec2..bf088937e4104 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -124,7 +124,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { LIBC_NAMESPACE::printf_core::FormatSection high_precision_conv; high_precision_conv.has_conv = true; - high_precision_conv.raw_string = "%4s"; + high_precision_conv.raw_string = "%.4s"; high_precision_conv.conv_name = 's'; high_precision_conv.precision = 4; high_precision_conv.conv_val_ptr = const_cast("456"); ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libc] c609112 - Fix/reapply "[libc] Migrate stdio tests to ErrnoCheckingTest." (#143972)
Author: Alexey Samsonov Date: 2025-06-13T10:25:26-07:00 New Revision: c609112a5383c10272e3afceedd4d03f26437cf0 URL: https://github.com/llvm/llvm-project/commit/c609112a5383c10272e3afceedd4d03f26437cf0 DIFF: https://github.com/llvm/llvm-project/commit/c609112a5383c10272e3afceedd4d03f26437cf0.diff LOG: Fix/reapply "[libc] Migrate stdio tests to ErrnoCheckingTest." (#143972) This reverts commit a93e55e57ed00a55f822c64e3520c7c732b58480 and fixes build and test failures: * Proper include added to setvbuf_test.cpp * fgetc/fgetc_unlocked/fgets tests are ported to ErrnoSetterMatcher and are made more precise. This fixes inconsistencies between expectations in regular and GPU builds - ErrnoSetterMatcher is configured to omit errno matching on GPUs, as fgetc implementation on GPU doesn't set errno, in contrast to Linux. Added: Modified: libc/test/src/stdio/CMakeLists.txt libc/test/src/stdio/fdopen_test.cpp libc/test/src/stdio/fgetc_test.cpp libc/test/src/stdio/fgetc_unlocked_test.cpp libc/test/src/stdio/fgets_test.cpp libc/test/src/stdio/fileop_test.cpp libc/test/src/stdio/fopencookie_test.cpp libc/test/src/stdio/remove_test.cpp libc/test/src/stdio/rename_test.cpp libc/test/src/stdio/setvbuf_test.cpp libc/test/src/stdio/unlocked_fileop_test.cpp libc/test/src/stdlib/StrtolTest.h libc/test/src/stdlib/strtold_test.cpp Removed: diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index ce2171f19597b..4aa8b95880018 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -20,6 +20,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite +libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -68,6 +69,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fwrite libc.src.stdio.setvbuf +libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -88,6 +90,7 @@ add_libc_test( libc.src.stdio.fread_unlocked libc.src.stdio.funlockfile libc.src.stdio.fwrite_unlocked 
+libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -109,6 +112,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite +libc.test.UnitTest.ErrnoCheckingTest LINK_LIBRARIES LibcMemoryHelpers ) @@ -438,6 +442,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.sys.stat.mkdirat libc.src.unistd.access libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -452,6 +457,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.rename libc.src.unistd.access libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -468,6 +474,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.fgets libc.src.stdio.fputs libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) endif() @@ -488,6 +495,8 @@ add_libc_test( libc.src.stdio.fopen libc.src.stdio.fwrite libc.src.stdio.getc +libc.test.UnitTest.ErrnoCheckingTest +libc.test.UnitTest.ErrnoSetterMatcher ) add_libc_test( @@ -510,6 +519,8 @@ add_libc_test( libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.getc_unlocked +libc.test.UnitTest.ErrnoCheckingTest +libc.test.UnitTest.ErrnoSetterMatcher ) add_libc_test( @@ -527,6 +538,8 @@ add_libc_test( libc.src.stdio.fgets libc.src.stdio.fopen libc.src.stdio.fwrite +libc.test.UnitTest.ErrnoCheckingTest +libc.test.UnitTest.ErrnoSetterMatcher ) add_libc_test( diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index 104fc478b100e..b53184c30be36 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -9,20 +9,21 @@ #include "src/stdio/fdopen.h" #include "hdr/fcntl_macros.h" -#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/fclose.h" #include "src/stdio/fgets.h" #include "src/stdio/fputs.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include 
"test/UnitTest/Test.h" #include // For S_IRWXU -TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { +using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); @@ -52,8 +53,7 @@ TEST(LlvmLib
[llvm-branch-commits] [clang] 9a30822 - [CIR][NFC] Fix forrange.cpp test (#144123)
Author: Andy Kaylor Date: 2025-06-13T10:28:03-07:00 New Revision: 9a3082276d21873a37925d0c6ad89bd28d065cea URL: https://github.com/llvm/llvm-project/commit/9a3082276d21873a37925d0c6ad89bd28d065cea DIFF: https://github.com/llvm/llvm-project/commit/9a3082276d21873a37925d0c6ad89bd28d065cea.diff LOG: [CIR][NFC] Fix forrange.cpp test (#144123) A recent change has cause the begin and end iterators in the forrange.cpp CIR codegen test to be marked as 'init' causing the test to fail. This change fixes the checks in the test. Added: Modified: clang/test/CIR/CodeGen/forrange.cpp Removed: diff --git a/clang/test/CIR/CodeGen/forrange.cpp b/clang/test/CIR/CodeGen/forrange.cpp index 6b6ccc79e59dd..45e146e9091d0 100644 --- a/clang/test/CIR/CodeGen/forrange.cpp +++ b/clang/test/CIR/CodeGen/forrange.cpp @@ -115,8 +115,8 @@ void for_range3() { // CIR:%[[C_ADDR:.*]] = cir.alloca !rec_C3{{.*}} ["c"] // CIR:cir.scope { // CIR: %[[RANGE_ADDR:.*]] = cir.alloca !cir.ptr{{.*}} ["__range1", init, const] -// CIR: %[[BEGIN_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr{{.*}} ["__begin1"] -// CIR: %[[END_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr{{.*}} ["__end1"] +// CIR: %[[BEGIN_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr{{.*}} ["__begin1", init] +// CIR: %[[END_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr{{.*}} ["__end1", init] // CIR: %[[E_ADDR:.*]] = cir.alloca !cir.ptr{{.*}} ["e", init, const] // CIR: cir.store{{.*}} %[[C_ADDR]], %[[RANGE_ADDR]] // CIR: cir.for : cond { ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] 62eea86 - [CIR] Update isSized with upstreamed types (#143960)
Author: Amr Hesham Date: 2025-06-13T19:29:21+02:00 New Revision: 62eea86424c4eacd38ad8a03f4bdae78687e3ade URL: https://github.com/llvm/llvm-project/commit/62eea86424c4eacd38ad8a03f4bdae78687e3ade DIFF: https://github.com/llvm/llvm-project/commit/62eea86424c4eacd38ad8a03f4bdae78687e3ade.diff LOG: [CIR] Update isSized with upstreamed types (#143960) Update `isSized` function with the upstreamed types Added: Modified: clang/lib/CIR/CodeGen/CIRGenBuilder.h clang/lib/CIR/CodeGen/CIRGenTypes.cpp clang/test/CIR/CodeGen/array.cpp Removed: diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 36c89809b4d90..a4bc69619d60c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -139,8 +139,9 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { } bool isSized(mlir::Type ty) { -if (mlir::isa(ty)) +if (mlir::isa( +ty)) return true; if (const auto vt = mlir::dyn_cast(ty)) diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp index eaba3dfd1105e..bab47924dd719 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp @@ -419,6 +419,15 @@ mlir::Type CIRGenTypes::convertType(QualType type) { case Type::ConstantArray: { const ConstantArrayType *arrTy = cast(ty); mlir::Type elemTy = convertTypeForMem(arrTy->getElementType()); + +// TODO(CIR): In LLVM, "lower arrays of undefined struct type to arrays of +// i8 just to have a concrete type" +if (!builder.isSized(elemTy)) { + cgm.errorNYI(SourceLocation(), "arrays of undefined struct type", type); + resultType = cgm.UInt32Ty; + break; +} + resultType = cir::ArrayType::get(elemTy, arrTy->getSize().getZExtValue()); break; } @@ -432,8 +441,8 @@ mlir::Type CIRGenTypes::convertType(QualType type) { } case Type::Enum: { -const EnumDecl *ED = cast(ty)->getDecl(); -if (auto integerType = ED->getIntegerType(); !integerType.isNull()) +const EnumDecl *ed = cast(ty)->getDecl(); +if (auto 
integerType = ed->getIntegerType(); !integerType.isNull()) return convertType(integerType); // Return a placeholder 'i32' type. This can be changed later when the // type is defined (see UpdateCompletedType), but is likely to be the diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp index 7b90c1682ec45..26e172a006451 100644 --- a/clang/test/CIR/CodeGen/array.cpp +++ b/clang/test/CIR/CodeGen/array.cpp @@ -473,3 +473,26 @@ void func10(int *a) { // OGCG: %[[ELE:.*]] = getelementptr inbounds i32, ptr %[[TMP_1]], i64 5 // OGCG: %[[TMP_2:.*]] = load i32, ptr %[[ELE]], align 4 // OGCG: store i32 %[[TMP_2]], ptr %[[INIT]], align 4 + +void func11() { int _Complex a[4]; } + +// CIR: %[[ARR:.*]] = cir.alloca !cir.array x 4>, !cir.ptr x 4>>, ["a"] + +// LLVM: %[[ARR:.*]] = alloca [4 x { i32, i32 }], i64 1, align 16 + +// OGCG: %[[ARR:.*]] = alloca [4 x { i32, i32 }], align 16 + +void func12() { + struct Point { +int x; +int y; + }; + + Point a[4]; +} + +// CIR: %[[ARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["a"] + +// LLVM: %[[ARR:.*]] = alloca [4 x %struct.Point], i64 1, align 16 + +// OGCG: %[[ARR:.*]] = alloca [4 x %struct.Point], align 16 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] 493c161 - [SPIRV] Fix ExecutionMode_fragment.ll test (#144116)
Author: Steven Perron Date: 2025-06-13T13:26:26-04:00 New Revision: 493c1612d6f8f7a40d0bf0ba28fb753be83fac1c URL: https://github.com/llvm/llvm-project/commit/493c1612d6f8f7a40d0bf0ba28fb753be83fac1c DIFF: https://github.com/llvm/llvm-project/commit/493c1612d6f8f7a40d0bf0ba28fb753be83fac1c.diff LOG: [SPIRV] Fix ExecutionMode_fragment.ll test (#144116) Fix test broken by https://github.com/llvm/llvm-project/pull/143412. Added: Modified: llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll Removed: diff --git a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll index 4fa764fe192d3..aab0ae05753fa 100644 --- a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll +++ b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll @@ -4,17 +4,16 @@ ; CHECK-DAG: OpEntryPoint Fragment %[[#entry:]] "main" {{.*}} ; CHECK-DAG: OpExecutionMode %[[#entry]] OriginUpperLeft +@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1 define void @main() #0 { entry: - %0 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false) + %0 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b0) %1 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1) %0, i32 0) store i32 1, ptr addrspace(11) %1, align 4 ret void } -declare target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32, i32, i32, i32, i1) #1 - attributes #0 = { "hlsl.shader"="pixel" } attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org 
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [IR2Vec] Scale embeddings once in vocab analysis instead of repetitive scaling (PR #143986)
@@ -104,7 +106,10 @@ MODULE_PASS("lower-ifunc", LowerIFuncPass()) MODULE_PASS("simplify-type-tests", SimplifyTypeTestsPass()) MODULE_PASS("lowertypetests", LowerTypeTestsPass()) MODULE_PASS("fatlto-cleanup", FatLtoCleanup()) -MODULE_PASS("pgo-force-function-attrs", PGOForceFunctionAttrsPass(PGOOpt ? PGOOpt->ColdOptType : PGOOptions::ColdFuncOpt::Default)) +MODULE_PASS("pgo-force-function-attrs", +PGOForceFunctionAttrsPass(PGOOpt svkeerthy wrote: Yeah, will do. Missed the unrelated formatting changes. https://github.com/llvm/llvm-project/pull/143986 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [IR2Vec] Scale embeddings once in vocab analysis instead of repetitive scaling (PR #143986)
@@ -259,32 +306,40 @@ Error IR2VecVocabAnalysis::readVocabulary() { return createFileError(VocabFile, BufOrError.getError()); auto Content = BufOrError.get()->getBuffer(); - json::Path::Root Path(""); + Expected ParsedVocabValue = json::parse(Content); if (!ParsedVocabValue) return ParsedVocabValue.takeError(); - bool Res = json::fromJSON(*ParsedVocabValue, Vocabulary, Path); - if (!Res) -return createStringError(errc::illegal_byte_sequence, - "Unable to parse the vocabulary"); + ir2vec::Vocab OpcodeVocab, TypeVocab, ArgVocab; + unsigned OpcodeDim, TypeDim, ArgDim; + if (auto Err = parseVocabSection("Opcodes", *ParsedVocabValue, OpcodeVocab, svkeerthy wrote: Correct. Will put it in the doc. https://github.com/llvm/llvm-project/pull/143986 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [IR2Vec] Scale embeddings once in vocab analysis instead of repetitive scaling (PR #143986)
@@ -234,6 +237,8 @@ class IR2VecVocabResult { class IR2VecVocabAnalysis : public AnalysisInfoMixin { ir2vec::Vocab Vocabulary; Error readVocabulary(); + Error parseVocabSection(const char *Key, const json::Value ParsedVocabValue, mtrofin wrote: s/const char*/StringRef s/const json::Value/const json::Value& https://github.com/llvm/llvm-project/pull/143986 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [IR2Vec] Scale embeddings once in vocab analysis instead of repetitive scaling (PR #143986)
@@ -104,7 +106,10 @@ MODULE_PASS("lower-ifunc", LowerIFuncPass()) MODULE_PASS("simplify-type-tests", SimplifyTypeTestsPass()) MODULE_PASS("lowertypetests", LowerTypeTestsPass()) MODULE_PASS("fatlto-cleanup", FatLtoCleanup()) -MODULE_PASS("pgo-force-function-attrs", PGOForceFunctionAttrsPass(PGOOpt ? PGOOpt->ColdOptType : PGOOptions::ColdFuncOpt::Default)) +MODULE_PASS("pgo-force-function-attrs", +PGOForceFunctionAttrsPass(PGOOpt mtrofin wrote: can you make the unrelated stylistic changes to this file in a separate patch? https://github.com/llvm/llvm-project/pull/143986 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang][OpenMP] Add symbol table scopes for `teams` and `parallel` (PR #144015)
https://github.com/tblah approved this pull request. LGTM. Thanks for the fix https://github.com/llvm/llvm-project/pull/144015 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
https://github.com/momchil-velikov edited https://github.com/llvm/llvm-project/pull/142422 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
momchil-velikov wrote: Commit message updated. https://github.com/llvm/llvm-project/pull/142422 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen] Limit number of analyzed predecessors (PR #142584)
https://github.com/aengelke updated https://github.com/llvm/llvm-project/pull/142584 >From 4cbc231699c11444cff73ff28b88dc0f3835c752 Mon Sep 17 00:00:00 2001 From: Alexis Engelke Date: Wed, 4 Jun 2025 09:21:02 + Subject: [PATCH 1/2] Move one check to beginning of function Created using spr 1.3.5-bogner --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index e96f3f8193b09..2dbabfe345d5e 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -1483,6 +1483,11 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( if (SuccChain.UnscheduledPredecessors == 0) return false; + // Compile-time optimization: runtime is quadratic in the number of + // predecessors. For such uncommon cases, exit early. + if (Succ->pred_size() > PredecessorLimit) +return false; + // There are two basic scenarios here: // - // Case 1: triangular shape CFG (if-then): @@ -1603,11 +1608,6 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb; bool BadCFGConflict = false; - // Compile-time optimization: runtime is quadratic in the number of - // predecessors. For such uncommon cases, exit early. 
- if (Succ->pred_size() > PredecessorLimit) -return false; - for (MachineBasicBlock *Pred : Succ->predecessors()) { BlockChain *PredChain = BlockToChain[Pred]; if (Pred == Succ || PredChain == &SuccChain || >From e90cfcb5740fc7297e05a876172ad8c25f596a33 Mon Sep 17 00:00:00 2001 From: Alexis Engelke Date: Fri, 13 Jun 2025 15:43:00 + Subject: [PATCH 2/2] Test new command line flag Created using spr 1.3.5-bogner --- llvm/test/CodeGen/RISCV/branch.ll | 49 +++ 1 file changed, 49 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/branch.ll b/llvm/test/CodeGen/RISCV/branch.ll index 578080cd3a240..ed86ca8ca4dd1 100644 --- a/llvm/test/CodeGen/RISCV/branch.ll +++ b/llvm/test/CodeGen/RISCV/branch.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV32I %s +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -block-placement-predecessor-limit=10 < %s \ +; RUN: | FileCheck -check-prefix=RV32I-MBPLIMIT %s define void @foo(i32 %a, ptr %b, i1 %c) nounwind { ; RV32I-LABEL: foo: @@ -48,6 +50,53 @@ define void @foo(i32 %a, ptr %b, i1 %c) nounwind { ; RV32I-NEXT:lw zero, 0(a1) ; RV32I-NEXT: .LBB0_14: # %end ; RV32I-NEXT:ret +; +; RV32I-MBPLIMIT-LABEL: foo: +; RV32I-MBPLIMIT: # %bb.0: +; RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:bne a3, a0, .LBB0_2 +; RV32I-MBPLIMIT-NEXT: .LBB0_1: # %end +; RV32I-MBPLIMIT-NEXT:ret +; RV32I-MBPLIMIT-NEXT: .LBB0_2: # %test2 +; RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:bne a3, a0, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.3: # %test3 +; RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:blt a3, a0, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.4: # %test4 +; RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:bge a3, a0, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.5: # %test5 +; RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:bltu a3, a0, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.6: # %test6 +; 
RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:bgeu a3, a0, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.7: # %test7 +; RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:blt a0, a3, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.8: # %test8 +; RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:bge a0, a3, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.9: # %test9 +; RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:bltu a0, a3, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.10: # %test10 +; RV32I-MBPLIMIT-NEXT:lw a3, 0(a1) +; RV32I-MBPLIMIT-NEXT:bgeu a0, a3, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.11: # %test11 +; RV32I-MBPLIMIT-NEXT:lw zero, 0(a1) +; RV32I-MBPLIMIT-NEXT:andi a2, a2, 1 +; RV32I-MBPLIMIT-NEXT:bnez a2, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.12: # %test12 +; RV32I-MBPLIMIT-NEXT:lw a0, 0(a1) +; RV32I-MBPLIMIT-NEXT:bgez a0, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.13: # %test13 +; RV32I-MBPLIMIT-NEXT:lw a0, 0(a1) +; RV32I-MBPLIMIT-NEXT:blez a0, .LBB0_1 +; RV32I-MBPLIMIT-NEXT: # %bb.14: # %test14 +; RV32I-MBPLIMIT-NEXT:lw zero, 0(a1) +; RV32I-MBPLIMIT-NEXT:ret %val1 = load volatile i32, ptr %b %tst1 = icmp eq i32 %val1, %a br i1 %tst1, label %end, label %test2 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/c
[llvm-branch-commits] [llvm] [MLGO][IR2Vec] Integrating IR2Vec with MLInliner (PR #143479)
https://github.com/svkeerthy updated https://github.com/llvm/llvm-project/pull/143479 >From a2bec77ad03e20cd76b6870149863049a96c4f9e Mon Sep 17 00:00:00 2001 From: svkeerthy Date: Tue, 10 Jun 2025 05:40:38 + Subject: [PATCH] [MLIniner][IR2Vec] Integrating IR2Vec with MLInliner --- .../Analysis/FunctionPropertiesAnalysis.h | 26 +++- llvm/include/llvm/Analysis/InlineAdvisor.h| 4 + .../llvm/Analysis/InlineModelFeatureMaps.h| 8 +- llvm/include/llvm/Analysis/MLInlineAdvisor.h | 1 + .../Analysis/FunctionPropertiesAnalysis.cpp | 115 +- llvm/lib/Analysis/InlineAdvisor.cpp | 29 llvm/lib/Analysis/MLInlineAdvisor.cpp | 34 +++- .../FunctionPropertiesAnalysisTest.cpp| 145 -- 8 files changed, 338 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h index babb6d9d6cf0c..06dbfc35a5294 100644 --- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h +++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h @@ -15,6 +15,7 @@ #define LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H #include "llvm/ADT/DenseSet.h" +#include "llvm/Analysis/IR2Vec.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" @@ -32,17 +33,19 @@ class FunctionPropertiesInfo { void updateAggregateStats(const Function &F, const LoopInfo &LI); void reIncludeBB(const BasicBlock &BB); + ir2vec::Embedding FunctionEmbedding = ir2vec::Embedding(0.0); + std::optional IR2VecVocab; + public: LLVM_ABI static FunctionPropertiesInfo getFunctionPropertiesInfo(const Function &F, const DominatorTree &DT, -const LoopInfo &LI); +const LoopInfo &LI, +const IR2VecVocabResult *VocabResult); LLVM_ABI static FunctionPropertiesInfo getFunctionPropertiesInfo(Function &F, FunctionAnalysisManager &FAM); - bool operator==(const FunctionPropertiesInfo &FPI) const { -return std::memcmp(this, &FPI, sizeof(FunctionPropertiesInfo)) == 0; - } + bool operator==(const FunctionPropertiesInfo &FPI) 
const; bool operator!=(const FunctionPropertiesInfo &FPI) const { return !(*this == FPI); @@ -137,6 +140,19 @@ class FunctionPropertiesInfo { int64_t CallReturnsVectorPointerCount = 0; int64_t CallWithManyArgumentsCount = 0; int64_t CallWithPointerArgumentCount = 0; + + const ir2vec::Embedding &getFunctionEmbedding() const { +return FunctionEmbedding; + } + + const std::optional &getIR2VecVocab() const { +return IR2VecVocab; + } + + // Helper intended to be useful for unittests + void setFunctionEmbeddingForTest(const ir2vec::Embedding &Embedding) { +FunctionEmbedding = Embedding; + } }; // Analysis pass @@ -192,7 +208,7 @@ class FunctionPropertiesUpdater { DominatorTree &getUpdatedDominatorTree(FunctionAnalysisManager &FAM) const; - DenseSet Successors; + DenseSet Successors, CallUsers; // Edges we might potentially need to remove from the dominator tree. SmallVector DomTreeUpdates; diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h index 9d15136e81d10..50ba3c13da70f 100644 --- a/llvm/include/llvm/Analysis/InlineAdvisor.h +++ b/llvm/include/llvm/Analysis/InlineAdvisor.h @@ -331,6 +331,10 @@ class InlineAdvisorAnalysis : public AnalysisInfoMixin { }; Result run(Module &M, ModuleAnalysisManager &MAM) { return Result(M, MAM); } + +private: + static bool initializeIR2VecVocabIfRequested(Module &M, + ModuleAnalysisManager &MAM); }; /// Printer pass for the InlineAdvisorAnalysis results. diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h index 961d5091bf9f3..a166621243cad 100644 --- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h +++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h @@ -142,6 +142,12 @@ enum class FeatureIndex : size_t { INLINE_FEATURE_ITERATOR(POPULATE_INDICES) #undef POPULATE_INDICES +// IR2Vec embeddings +// Dimensions of embeddings are not known in the compile time (until vocab is +// read). 
Hence macros cannot be used here. + callee_embedding, + caller_embedding, + NumberOfFeatures }; // clang-format on @@ -154,7 +160,7 @@ inlineCostFeatureToMlFeature(InlineCostFeatureIndex Feature) { constexpr size_t NumberOfFeatures = static_cast(FeatureIndex::NumberOfFeatures); -LLVM_ABI extern const std::vector FeatureMap; +LLVM_ABI extern std::vector FeatureMap; LLVM_ABI extern const char *const DecisionName; LLVM_ABI extern const TensorSpec InlineDecisionSpec; diff --git a/llvm/include/llvm/Analysis/MLInlineAdvisor.h b/llvm/include/llvm/Analysis/MLInlineAdvisor.h index 580dd5e95d760..8262dd0846ede 100644 --- a/llvm/include/llvm/Analysis/MLInlin
[llvm-branch-commits] [llvm] [IR2Vec] Simplifying creation of Embedder (PR #143999)
https://github.com/svkeerthy updated https://github.com/llvm/llvm-project/pull/143999 >From d71dd503f4794abf8a396ddb8a5deeafe0d75c83 Mon Sep 17 00:00:00 2001 From: svkeerthy Date: Thu, 12 Jun 2025 23:54:10 + Subject: [PATCH] Simplifying creation of Embedder --- llvm/docs/MLGO.rst| 7 +-- llvm/include/llvm/Analysis/IR2Vec.h | 4 +- .../Analysis/FunctionPropertiesAnalysis.cpp | 10 ++--- llvm/lib/Analysis/IR2Vec.cpp | 17 +++ .../FunctionPropertiesAnalysisTest.cpp| 7 ++- llvm/unittests/Analysis/IR2VecTest.cpp| 44 +++ 6 files changed, 33 insertions(+), 56 deletions(-) diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst index 28095447f6a5a..0b849f3382f63 100644 --- a/llvm/docs/MLGO.rst +++ b/llvm/docs/MLGO.rst @@ -482,14 +482,9 @@ embeddings can be computed and accessed via an ``ir2vec::Embedder`` instance. // Assuming F is an llvm::Function& // For example, using IR2VecKind::Symbolic: - Expected> EmbOrErr = + std::unique_ptr Emb = ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary); - if (auto Err = EmbOrErr.takeError()) { -// Handle error in embedder creation -return; - } - std::unique_ptr Emb = std::move(*EmbOrErr); 3. **Compute and Access Embeddings**: Call ``getFunctionVector()`` to get the embedding for the function. diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 2a7a6edda70a8..06312562060aa 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -170,8 +170,8 @@ class Embedder { virtual ~Embedder() = default; /// Factory method to create an Embedder object. - static Expected> - create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary); + static std::unique_ptr create(IR2VecKind Mode, const Function &F, + const Vocab &Vocabulary); /// Returns a map containing instructions and the corresponding embeddings for /// the function F if it has been computed. 
If not, it computes the embeddings diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index 29d3aaf46dc06..dd4eb7f0df053 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -204,16 +204,12 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock &BB, // We instantiate the IR2Vec embedder each time, as having an unique // pointer to the embedder as member of the class would make it // non-copyable. Instantiating the embedder in itself is not costly. -auto EmbOrErr = ir2vec::Embedder::create(IR2VecKind::Symbolic, +auto Embedder = ir2vec::Embedder::create(IR2VecKind::Symbolic, *BB.getParent(), *IR2VecVocab); -if (Error Err = EmbOrErr.takeError()) { - handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { -BB.getContext().emitError("Error creating IR2Vec embeddings: " + - EI.message()); - }); +if (!Embedder) { + BB.getContext().emitError("Error creating IR2Vec embeddings"); return; } -auto Embedder = std::move(*EmbOrErr); const auto &BBEmbedding = Embedder->getBBVector(BB); // Subtract BBEmbedding from Function embedding if the direction is -1, // and add it if the direction is +1. 
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 7ff7acebedf4e..27cc2a4109879 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -123,13 +123,14 @@ Embedder::Embedder(const Function &F, const Vocab &Vocabulary) Dimension(Vocabulary.begin()->second.size()), OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {} -Expected> -Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) { +std::unique_ptr Embedder::create(IR2VecKind Mode, const Function &F, + const Vocab &Vocabulary) { switch (Mode) { case IR2VecKind::Symbolic: return std::make_unique(F, Vocabulary); } - return make_error("Unknown IR2VecKind", errc::invalid_argument); + llvm_unreachable("Unknown IR2Vec kind"); + return nullptr; } // FIXME: Currently lookups are string based. Use numeric Keys @@ -384,17 +385,13 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M, auto Vocab = IR2VecVocabResult.getVocabulary(); for (Function &F : M) { -Expected> EmbOrErr = +std::unique_ptr Emb = Embedder::create(IR2VecKind::Symbolic, F, Vocab); -if (auto Err = EmbOrErr.takeError()) { - handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { -OS << "Error creating IR2Vec embeddings: " << EI.message() << "\n"; - }); +if (!Emb) { + OS << "Error creating I
[llvm-branch-commits] [llvm] [MLGO][IR2Vec] Integrating IR2Vec with MLInliner (PR #143479)
https://github.com/svkeerthy updated https://github.com/llvm/llvm-project/pull/143479 >From a2bec77ad03e20cd76b6870149863049a96c4f9e Mon Sep 17 00:00:00 2001 From: svkeerthy Date: Tue, 10 Jun 2025 05:40:38 + Subject: [PATCH] [MLIniner][IR2Vec] Integrating IR2Vec with MLInliner --- .../Analysis/FunctionPropertiesAnalysis.h | 26 +++- llvm/include/llvm/Analysis/InlineAdvisor.h| 4 + .../llvm/Analysis/InlineModelFeatureMaps.h| 8 +- llvm/include/llvm/Analysis/MLInlineAdvisor.h | 1 + .../Analysis/FunctionPropertiesAnalysis.cpp | 115 +- llvm/lib/Analysis/InlineAdvisor.cpp | 29 llvm/lib/Analysis/MLInlineAdvisor.cpp | 34 +++- .../FunctionPropertiesAnalysisTest.cpp| 145 -- 8 files changed, 338 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h index babb6d9d6cf0c..06dbfc35a5294 100644 --- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h +++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h @@ -15,6 +15,7 @@ #define LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H #include "llvm/ADT/DenseSet.h" +#include "llvm/Analysis/IR2Vec.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" @@ -32,17 +33,19 @@ class FunctionPropertiesInfo { void updateAggregateStats(const Function &F, const LoopInfo &LI); void reIncludeBB(const BasicBlock &BB); + ir2vec::Embedding FunctionEmbedding = ir2vec::Embedding(0.0); + std::optional IR2VecVocab; + public: LLVM_ABI static FunctionPropertiesInfo getFunctionPropertiesInfo(const Function &F, const DominatorTree &DT, -const LoopInfo &LI); +const LoopInfo &LI, +const IR2VecVocabResult *VocabResult); LLVM_ABI static FunctionPropertiesInfo getFunctionPropertiesInfo(Function &F, FunctionAnalysisManager &FAM); - bool operator==(const FunctionPropertiesInfo &FPI) const { -return std::memcmp(this, &FPI, sizeof(FunctionPropertiesInfo)) == 0; - } + bool operator==(const FunctionPropertiesInfo &FPI) 
const; bool operator!=(const FunctionPropertiesInfo &FPI) const { return !(*this == FPI); @@ -137,6 +140,19 @@ class FunctionPropertiesInfo { int64_t CallReturnsVectorPointerCount = 0; int64_t CallWithManyArgumentsCount = 0; int64_t CallWithPointerArgumentCount = 0; + + const ir2vec::Embedding &getFunctionEmbedding() const { +return FunctionEmbedding; + } + + const std::optional &getIR2VecVocab() const { +return IR2VecVocab; + } + + // Helper intended to be useful for unittests + void setFunctionEmbeddingForTest(const ir2vec::Embedding &Embedding) { +FunctionEmbedding = Embedding; + } }; // Analysis pass @@ -192,7 +208,7 @@ class FunctionPropertiesUpdater { DominatorTree &getUpdatedDominatorTree(FunctionAnalysisManager &FAM) const; - DenseSet Successors; + DenseSet Successors, CallUsers; // Edges we might potentially need to remove from the dominator tree. SmallVector DomTreeUpdates; diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h index 9d15136e81d10..50ba3c13da70f 100644 --- a/llvm/include/llvm/Analysis/InlineAdvisor.h +++ b/llvm/include/llvm/Analysis/InlineAdvisor.h @@ -331,6 +331,10 @@ class InlineAdvisorAnalysis : public AnalysisInfoMixin { }; Result run(Module &M, ModuleAnalysisManager &MAM) { return Result(M, MAM); } + +private: + static bool initializeIR2VecVocabIfRequested(Module &M, + ModuleAnalysisManager &MAM); }; /// Printer pass for the InlineAdvisorAnalysis results. diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h index 961d5091bf9f3..a166621243cad 100644 --- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h +++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h @@ -142,6 +142,12 @@ enum class FeatureIndex : size_t { INLINE_FEATURE_ITERATOR(POPULATE_INDICES) #undef POPULATE_INDICES +// IR2Vec embeddings +// Dimensions of embeddings are not known in the compile time (until vocab is +// read). 
Hence macros cannot be used here. + callee_embedding, + caller_embedding, + NumberOfFeatures }; // clang-format on @@ -154,7 +160,7 @@ inlineCostFeatureToMlFeature(InlineCostFeatureIndex Feature) { constexpr size_t NumberOfFeatures = static_cast(FeatureIndex::NumberOfFeatures); -LLVM_ABI extern const std::vector FeatureMap; +LLVM_ABI extern std::vector FeatureMap; LLVM_ABI extern const char *const DecisionName; LLVM_ABI extern const TensorSpec InlineDecisionSpec; diff --git a/llvm/include/llvm/Analysis/MLInlineAdvisor.h b/llvm/include/llvm/Analysis/MLInlineAdvisor.h index 580dd5e95d760..8262dd0846ede 100644 --- a/llvm/include/llvm/Analysis/MLInlin
[llvm-branch-commits] [llvm] [IR2Vec] Simplifying creation of Embedder (PR #143999)
https://github.com/svkeerthy updated https://github.com/llvm/llvm-project/pull/143999 >From d71dd503f4794abf8a396ddb8a5deeafe0d75c83 Mon Sep 17 00:00:00 2001 From: svkeerthy Date: Thu, 12 Jun 2025 23:54:10 + Subject: [PATCH] Simplifying creation of Embedder --- llvm/docs/MLGO.rst| 7 +-- llvm/include/llvm/Analysis/IR2Vec.h | 4 +- .../Analysis/FunctionPropertiesAnalysis.cpp | 10 ++--- llvm/lib/Analysis/IR2Vec.cpp | 17 +++ .../FunctionPropertiesAnalysisTest.cpp| 7 ++- llvm/unittests/Analysis/IR2VecTest.cpp| 44 +++ 6 files changed, 33 insertions(+), 56 deletions(-) diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst index 28095447f6a5a..0b849f3382f63 100644 --- a/llvm/docs/MLGO.rst +++ b/llvm/docs/MLGO.rst @@ -482,14 +482,9 @@ embeddings can be computed and accessed via an ``ir2vec::Embedder`` instance. // Assuming F is an llvm::Function& // For example, using IR2VecKind::Symbolic: - Expected> EmbOrErr = + std::unique_ptr Emb = ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary); - if (auto Err = EmbOrErr.takeError()) { -// Handle error in embedder creation -return; - } - std::unique_ptr Emb = std::move(*EmbOrErr); 3. **Compute and Access Embeddings**: Call ``getFunctionVector()`` to get the embedding for the function. diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 2a7a6edda70a8..06312562060aa 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -170,8 +170,8 @@ class Embedder { virtual ~Embedder() = default; /// Factory method to create an Embedder object. - static Expected> - create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary); + static std::unique_ptr create(IR2VecKind Mode, const Function &F, + const Vocab &Vocabulary); /// Returns a map containing instructions and the corresponding embeddings for /// the function F if it has been computed. 
If not, it computes the embeddings diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index 29d3aaf46dc06..dd4eb7f0df053 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -204,16 +204,12 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock &BB, // We instantiate the IR2Vec embedder each time, as having an unique // pointer to the embedder as member of the class would make it // non-copyable. Instantiating the embedder in itself is not costly. -auto EmbOrErr = ir2vec::Embedder::create(IR2VecKind::Symbolic, +auto Embedder = ir2vec::Embedder::create(IR2VecKind::Symbolic, *BB.getParent(), *IR2VecVocab); -if (Error Err = EmbOrErr.takeError()) { - handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { -BB.getContext().emitError("Error creating IR2Vec embeddings: " + - EI.message()); - }); +if (!Embedder) { + BB.getContext().emitError("Error creating IR2Vec embeddings"); return; } -auto Embedder = std::move(*EmbOrErr); const auto &BBEmbedding = Embedder->getBBVector(BB); // Subtract BBEmbedding from Function embedding if the direction is -1, // and add it if the direction is +1. 
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 7ff7acebedf4e..27cc2a4109879 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -123,13 +123,14 @@ Embedder::Embedder(const Function &F, const Vocab &Vocabulary) Dimension(Vocabulary.begin()->second.size()), OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {} -Expected> -Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) { +std::unique_ptr Embedder::create(IR2VecKind Mode, const Function &F, + const Vocab &Vocabulary) { switch (Mode) { case IR2VecKind::Symbolic: return std::make_unique(F, Vocabulary); } - return make_error("Unknown IR2VecKind", errc::invalid_argument); + llvm_unreachable("Unknown IR2Vec kind"); + return nullptr; } // FIXME: Currently lookups are string based. Use numeric Keys @@ -384,17 +385,13 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M, auto Vocab = IR2VecVocabResult.getVocabulary(); for (Function &F : M) { -Expected> EmbOrErr = +std::unique_ptr Emb = Embedder::create(IR2VecKind::Symbolic, F, Vocab); -if (auto Err = EmbOrErr.takeError()) { - handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { -OS << "Error creating IR2Vec embeddings: " << EI.message() << "\n"; - }); +if (!Emb) { + OS << "Error creating I
[llvm-branch-commits] [llvm] 69ba6fa - Revert "[PowerPC][NFC] Pre-commit test case for checking whether `mtvsrbmi` …"
Author: zhijian lin Date: 2025-06-13T09:24:56-04:00 New Revision: 69ba6fa610e19baa1d0d18f04a27cb5f45db1711 URL: https://github.com/llvm/llvm-project/commit/69ba6fa610e19baa1d0d18f04a27cb5f45db1711 DIFF: https://github.com/llvm/llvm-project/commit/69ba6fa610e19baa1d0d18f04a27cb5f45db1711.diff LOG: Revert "[PowerPC][NFC] Pre-commit test case for checking whether `mtvsrbmi` …" This reverts commit 9c2e0bd59ce0438fcad61b0468fd939c6282d048. Added: Modified: Removed: llvm/test/CodeGen/PowerPC/mtvsrbmi.ll diff --git a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll deleted file mode 100644 index 7ed57c300ec71..0 --- a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll +++ /dev/null @@ -1,44 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; Verify whether the generated assembly for the following function includes the mtvsrbmi instruction. -; vector unsigned char v00FF() -; { -; vector unsigned char x = { 0xFF, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 }; -; return x; -; } - -; RUN: llc < %s -ppc-asm-full-reg-names -mtriple=powerpc-ibm-aix -mcpu=pwr10 -verify-machineinstrs \ -; RUN: | FileCheck %s --check-prefix=CHECK - -define dso_local noundef range(i8 -1, 1) <16 x i8> @_Z5v00FFv() { -; CHECK-LABEL: _Z5v00FFv: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT:lwz r3, L..C0(r2) # %const.0 -; CHECK-NEXT:lxv vs34, 0(r3) -; CHECK-NEXT:blr -entry: - ret <16 x i8> -} - -; CHECK: L..CPI0_0: -; CHECK-NEXT: .byte 255 # 0xff -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 -; CHECK-NEXT: .byte 0 # 0x0 - -; CHECK: ._Z5v00FFv: -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: lwz r3, 
L..C0(r2) -; CHECK-NEXT: lxv vs34, 0(r3) -; CHECK-NEXT: blr ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR] Legalize certain `vector.transfer_read` ops of scalable vectors (PR #143146)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/143146 >From 493955781f28b8b6caaeff1b45f7b7a06b43086c Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Wed, 14 May 2025 09:03:49 + Subject: [PATCH 1/3] [MLIR] Legalize certain `vector.transfer_read` ops of scalable vectors THis patch add a transform of `transfer_read` operation to change the vector type to one that can be mapped to an LLVM type. This is done by collapsing trailing dimensions so we obtain a vector type with a single scalable dimension in the rightmost position. --- .../Transforms/LegalizeVectorStorage.cpp | 110 - .../ArmSVE/legalize-transfer-read.mlir| 226 ++ .../transfer-read-scalable-not-rightmost.mlir | 72 ++ 3 files changed, 407 insertions(+), 1 deletion(-) create mode 100644 mlir/test/Dialect/ArmSVE/legalize-transfer-read.mlir create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/transfer-read-scalable-not-rightmost.mlir diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp index d2ac850a5f70b..f16d33c004fec 100644 --- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp +++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp @@ -298,6 +298,113 @@ struct LegalizeSVEMaskLoadConversion : public OpRewritePattern { } }; +/// Transforms a `transfer_read` operation so it reads vector of a type that +/// can be mapped to an LLVM type. This is done by collapsing trailing +/// dimensions so we obtain a vector type with a single scalable dimension in +/// the rightmost position. 
+/// +/// Example: +/// ``` +/// %v = vector.transfer_read %M[%i, %j, %c0, %c0], %c0_i8 +/// {in_bounds = [false, true, true, true]} +/// : memref, vector<2x[4]x2x8xi8> +/// ``` +/// is rewriten to +/// ``` +/// %collapse_shape = memref.collapse_shape %M [[0], [1, 2, 3]] +/// : memref into memref +/// %0 = vector.transfer_read %collapse_shape[%i, %j], %c0_i8 +/// {in_bounds = [false, true]} +/// : memref, vector<2x[64]xi8> +/// %1 = vector.shape_cast %0 : vector<2x[64]xi8> to vector<2x[4]x2x8xi8> +/// ``` +struct LegalizeTransferRead : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::TransferReadOp readOp, +PatternRewriter &rewriter) const override { + +if (!readOp.getPermutationMap().isMinorIdentity()) + return rewriter.notifyMatchFailure(readOp, "non-identity permutation"); + +// We handle transfers of vectors with rank >= 2 and a single scalable +// dimension. +VectorType origVT = readOp.getVectorType(); +ArrayRef origScalableDims = origVT.getScalableDims(); +const int64_t origVRank = origVT.getRank(); +if (origVRank < 2 || llvm::count(origScalableDims, true) != 1) + return rewriter.notifyMatchFailure(readOp, "wrong dimensions"); + +// Number of trailing dimensions to collapse, including the scalable +// dimension. Nothing to do if the single scalable dimension is already the +// last one. +const int64_t numCollapseDims = std::distance( +llvm::find(origScalableDims, true), origScalableDims.end()); +if (numCollapseDims < 2) + return rewriter.notifyMatchFailure(readOp, + "scalable dimension is trailing"); + +// We want a simple memref (not a tensor) with contiguous elements for at +// least all the trailing dimensions up to and including the scalable one. 
+auto memTy = dyn_cast(readOp.getBase().getType()); +if (!(memTy && memTy.areTrailingDimsContiguous(numCollapseDims))) + return rewriter.notifyMatchFailure( + readOp, "non-contiguous memref dimensions to collapse"); + +// The collapsed dimensions (excluding the scalable one) of the vector and +// the memref must match and the corresponding indices must be in-bounds (it +// follows these indices would be zero). This guarantees that the operation +// transfers a contiguous block. +if (!llvm::equal(memTy.getShape().take_back(numCollapseDims - 1), + origVT.getShape().take_back(numCollapseDims - 1))) + return rewriter.notifyMatchFailure( + readOp, "memref and vector dimensions do not match"); + +SmallVector origInBounds = readOp.getInBoundsValues(); +if (!llvm::all_of( +ArrayRef(origInBounds).take_back(numCollapseDims - 1), +[](bool v) { return v; })) + return rewriter.notifyMatchFailure(readOp, + "out-if-bounds index to collapse"); + +// Collapse the trailing dimensions of the memref. +SmallVector reassoc; +for (int64_t i = 0; i < memTy.getRank() - numCollapseDims + 1; ++i) + reassoc.push_back({i}); +for (int64_t i = memTy.getRank() - numCollapseDims + 1; i < memTy.getRank(); + ++i) + reassoc.
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
@@ -203,21 +206,21 @@ func.func @transfer_read_dynamic_dim_to_flatten( return %res : vector<1x2x6xi32> } -// CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)> +// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)> // CHECK-LABEL: func.func @transfer_read_dynamic_dim_to_flatten // CHECK-SAME:%[[IDX_1:arg0]] // CHECK-SAME:%[[IDX_2:arg1]] // CHECK-SAME:%[[MEM:arg2]] -// CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 newling wrote: Makes sense, thanks https://github.com/llvm/llvm-project/pull/142422 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [libc++][C++03] Fix a bunch of random tests (PR #144117)
https://github.com/philnik777 created https://github.com/llvm/llvm-project/pull/144117 This fixes/removes a bunch of random tests. They all failed in relatively simple to fix ways. Specificially (all inside `libcxx/test/libcxx-03`): - `utilities/template.bitset/includes.pass.cpp`: the header guards have different names now (guard names fixed) - `utilities/meta/is_referenceable.compile.pass.cpp`: The name changed from `__libcpp_is_referenceable` (reverted name) - `utilities/function.objects/refwrap/desugars_to.compile.pass.cpp`: Optimization has been added after the header split (test removed) - `type_traits/is_replaceable.compile.pass.cpp`: `__is_replacable_v` has been added after the header split (test removed) - `type_traits/is_constant_evaluated.pass.cpp`: Ran C++11 code accidentally (C++11 test parts removed) - `type_traits/desugars_to.compile.pass.cpp`: Optimization has been added after the header split (test removed) - `numerics/bit.ops.pass.cpp`: Tried to include header which doesn't exist (removed include and related code which wasn't executed in C++03) - `experimental/fexperimental-library.compile.pass.cpp`: This test is irrelevant for C++03, since there are no C++03 experimental features (test removed) - `containers/container_traits.compile.pass.cpp`: `container_traits` have been introduced after the header split (test removed) >From 94255420a3a9e470973d3f3d4f7bed76bef39d23 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 13 Jun 2025 18:51:26 +0200 Subject: [PATCH] [libc++][C++03] Fix a bunch of random tests --- .../container_traits.compile.pass.cpp | 165 - .../fexperimental-library.compile.pass.cpp| 31 -- .../bounded_iter/comparison.pass.cpp | 4 +- .../test/libcxx-03/numerics/bit.ops.pass.cpp | 12 +- .../type_traits/desugars_to.compile.pass.cpp | 42 --- .../is_constant_evaluated.pass.cpp| 8 +- .../is_replaceable.compile.pass.cpp | 313 -- .../refwrap/desugars_to.compile.pass.cpp | 36 -- .../meta/is_referenceable.compile.pass.cpp| 230 +++-- 
.../template.bitset/includes.pass.cpp | 8 +- 10 files changed, 121 insertions(+), 728 deletions(-) delete mode 100644 libcxx/test/libcxx-03/containers/container_traits.compile.pass.cpp delete mode 100644 libcxx/test/libcxx-03/experimental/fexperimental-library.compile.pass.cpp delete mode 100644 libcxx/test/libcxx-03/type_traits/desugars_to.compile.pass.cpp delete mode 100644 libcxx/test/libcxx-03/type_traits/is_replaceable.compile.pass.cpp delete mode 100644 libcxx/test/libcxx-03/utilities/function.objects/refwrap/desugars_to.compile.pass.cpp diff --git a/libcxx/test/libcxx-03/containers/container_traits.compile.pass.cpp b/libcxx/test/libcxx-03/containers/container_traits.compile.pass.cpp deleted file mode 100644 index 22be217487951..0 --- a/libcxx/test/libcxx-03/containers/container_traits.compile.pass.cpp +++ /dev/null @@ -1,165 +0,0 @@ -//===--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===--===// -// -// <__type_traits/container_traits.h> -// - -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - -#include <__type_traits/container_traits.h> - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "test_allocator.h" -#include "test_macros.h" -#include "MoveOnly.h" - -struct ThrowOnMove { - ThrowOnMove(); - ThrowOnMove(const ThrowOnMove&) TEST_NOEXCEPT_COND(false); - ThrowOnMove(ThrowOnMove&&) TEST_NOEXCEPT_COND(false); - ThrowOnMove& operator=(ThrowOnMove&&) TEST_NOEXCEPT_COND(false); - ThrowOnMove& operator=(const ThrowOnMove&) TEST_NOEXCEPT_COND(false); - - bool operator<(ThrowOnMove const&) const; - bool operator==(ThrowOnMove const&) const; -}; - -struct NonCopyThrowOnMove { - NonCopyThrowOnMove(); - NonCopyThrowOnMove(NonCopyThrowOnMove&&) TEST_NOEXCEPT_COND(false); - NonCopyThrowOnMove(const NonCopyThrowOnMove&) = delete; - NonCopyThrowOnMove& operator=(NonCopyThrowOnMove&&) TEST_NOEXCEPT_COND(false); - NonCopyThrowOnMove& operator=(const NonCopyThrowOnMove&) = delete; - - bool operator<(NonCopyThrowOnMove const&) const; - bool operator==(NonCopyThrowOnMove const&) const; -}; - -struct ThrowingHash { - template - std::size_t operator()(const T&) const TEST_NOEXCEPT_COND(false); -}; - -struct NoThrowHash { - template - std::size_t operator()(const T&) const TEST_NOEXCEPT; -}; - -template -void check() { - static_assert( - std::__container_traits::__emplacement_has_strong_exception_safety_guarantee == Expected, ""); -} - -void test() { - check >(); - check > >(); - check >(); - check >(); - check >(); - - check >(
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
@@ -630,7 +639,10 @@ class FlattenContiguousRowMajorTransferReadPattern if (transferReadOp.getMask()) return failure(); -int64_t firstDimToCollapse = sourceType.getRank() - vectorType.getRank(); newling wrote: Looks good, thanks! https://github.com/llvm/llvm-project/pull/142422 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
@@ -630,7 +639,10 @@ class FlattenContiguousRowMajorTransferReadPattern if (transferReadOp.getMask()) return failure(); -int64_t firstDimToCollapse = sourceType.getRank() - vectorType.getRank(); newling wrote: > For memrefs with dynamic dimensions and no strides or maps, e.g. > memref<2x?x2xi8> Makes sense yes https://github.com/llvm/llvm-project/pull/142422 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [libc++][C++03] Fix a bunch of random tests (PR #144117)
github-actions[bot] wrote: :warning: C/C++ code formatter, clang-format found issues in your code. :warning: You can test this locally with the following command: ``bash git-clang-format --diff HEAD~1 HEAD --extensions cpp -- libcxx/test/libcxx-03/iterators/bounded_iter/comparison.pass.cpp libcxx/test/libcxx-03/numerics/bit.ops.pass.cpp libcxx/test/libcxx-03/type_traits/is_constant_evaluated.pass.cpp libcxx/test/libcxx-03/utilities/meta/is_referenceable.compile.pass.cpp libcxx/test/libcxx-03/utilities/template.bitset/includes.pass.cpp `` View the diff from clang-format here. ``diff diff --git a/libcxx/test/libcxx-03/utilities/template.bitset/includes.pass.cpp b/libcxx/test/libcxx-03/utilities/template.bitset/includes.pass.cpp index f6e6960c9..d69a4b3b8 100644 --- a/libcxx/test/libcxx-03/utilities/template.bitset/includes.pass.cpp +++ b/libcxx/test/libcxx-03/utilities/template.bitset/includes.pass.cpp @@ -13,15 +13,15 @@ #include "test_macros.h" #ifndef _LIBCPP___CXX03_STRING -#error has not been included +# error has not been included #endif #ifndef _LIBCPP___CXX03_STDEXCEPT -#error has not been included +# error has not been included #endif #ifndef _LIBCPP___CXX03_IOSFWD -#error has not been included +# error has not been included #endif int main(int, char**) `` https://github.com/llvm/llvm-project/pull/144117 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [IR2Vec] Simplifying creation of Embedder (PR #143999)
https://github.com/svkeerthy updated https://github.com/llvm/llvm-project/pull/143999 >From 0d921416a0f81e5634705dc9dfc5363d721a55bf Mon Sep 17 00:00:00 2001 From: svkeerthy Date: Thu, 12 Jun 2025 23:54:10 + Subject: [PATCH] Simplifying creation of Embedder --- llvm/docs/MLGO.rst| 7 +-- llvm/include/llvm/Analysis/IR2Vec.h | 4 +- .../Analysis/FunctionPropertiesAnalysis.cpp | 10 ++--- llvm/lib/Analysis/IR2Vec.cpp | 17 +++ .../FunctionPropertiesAnalysisTest.cpp| 7 ++- llvm/unittests/Analysis/IR2VecTest.cpp| 44 +++ 6 files changed, 33 insertions(+), 56 deletions(-) diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst index 4f8fb3f59ca19..e7bba9995b75b 100644 --- a/llvm/docs/MLGO.rst +++ b/llvm/docs/MLGO.rst @@ -479,14 +479,9 @@ embeddings can be computed and accessed via an ``ir2vec::Embedder`` instance. // Assuming F is an llvm::Function& // For example, using IR2VecKind::Symbolic: - Expected> EmbOrErr = + std::unique_ptr Emb = ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary); - if (auto Err = EmbOrErr.takeError()) { -// Handle error in embedder creation -return; - } - std::unique_ptr Emb = std::move(*EmbOrErr); 3. **Compute and Access Embeddings**: Call ``getFunctionVector()`` to get the embedding for the function. diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index f1aaf4cd2e013..6efa6eac56af9 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -170,8 +170,8 @@ class Embedder { virtual ~Embedder() = default; /// Factory method to create an Embedder object. - static Expected> - create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary); + static std::unique_ptr create(IR2VecKind Mode, const Function &F, + const Vocab &Vocabulary); /// Returns a map containing instructions and the corresponding embeddings for /// the function F if it has been computed. 
If not, it computes the embeddings diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index 29d3aaf46dc06..dd4eb7f0df053 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -204,16 +204,12 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock &BB, // We instantiate the IR2Vec embedder each time, as having an unique // pointer to the embedder as member of the class would make it // non-copyable. Instantiating the embedder in itself is not costly. -auto EmbOrErr = ir2vec::Embedder::create(IR2VecKind::Symbolic, +auto Embedder = ir2vec::Embedder::create(IR2VecKind::Symbolic, *BB.getParent(), *IR2VecVocab); -if (Error Err = EmbOrErr.takeError()) { - handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { -BB.getContext().emitError("Error creating IR2Vec embeddings: " + - EI.message()); - }); +if (!Embedder) { + BB.getContext().emitError("Error creating IR2Vec embeddings"); return; } -auto Embedder = std::move(*EmbOrErr); const auto &BBEmbedding = Embedder->getBBVector(BB); // Subtract BBEmbedding from Function embedding if the direction is -1, // and add it if the direction is +1. 
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index de9c2db9531e8..308c3d86a7668 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -123,13 +123,14 @@ Embedder::Embedder(const Function &F, const Vocab &Vocabulary) Dimension(Vocabulary.begin()->second.size()), OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {} -Expected> -Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) { +std::unique_ptr Embedder::create(IR2VecKind Mode, const Function &F, + const Vocab &Vocabulary) { switch (Mode) { case IR2VecKind::Symbolic: return std::make_unique(F, Vocabulary); } - return make_error("Unknown IR2VecKind", errc::invalid_argument); + llvm_unreachable("Unknown IR2Vec kind"); + return nullptr; } // FIXME: Currently lookups are string based. Use numeric Keys @@ -388,17 +389,13 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M, auto Vocab = IR2VecVocabResult.getVocabulary(); for (Function &F : M) { -Expected> EmbOrErr = +std::unique_ptr Emb = Embedder::create(IR2VecKind::Symbolic, F, Vocab); -if (auto Err = EmbOrErr.takeError()) { - handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { -OS << "Error creating IR2Vec embeddings: " << EI.message() << "\n"; - }); +if (!Emb) { + OS << "Error creating I
[llvm-branch-commits] [llvm] WebAssembly: Stop directly using RuntimeLibcalls.def (PR #143054)
@@ -528,23 +528,20 @@ RuntimeLibcallSignatureTable &getRuntimeLibcallSignatures() { // constructor for use with a static variable struct StaticLibcallNameMap { StringMap Map; - StaticLibcallNameMap() { -static const std::pair NameLibcalls[] = { -#define HANDLE_LIBCALL(code, name) {(const char *)name, RTLIB::code}, -#include "llvm/IR/RuntimeLibcalls.def" -#undef HANDLE_LIBCALL -}; -for (const auto &NameLibcall : NameLibcalls) { - if (NameLibcall.first != nullptr && - getRuntimeLibcallSignatures().Table[NameLibcall.second] != - unsupported) { -assert(!Map.contains(NameLibcall.first) && + StaticLibcallNameMap(const Triple &TT) { +// FIXME: This is broken if there are ever different triples compiled with +// different libcalls. +RTLIB::RuntimeLibcallsInfo RTCI(TT); +for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) { + RTLIB::Libcall LC = static_cast(I); + const char *NameLibcall = RTCI.getLibcallName(LC); + if (NameLibcall != nullptr && + getRuntimeLibcallSignatures().Table[LC] != unsupported) { +assert(!Map.contains(NameLibcall) && "duplicate libcall names in name map"); -Map[NameLibcall.first] = NameLibcall.second; +Map[NameLibcall] = LC; } } - -Map["emscripten_return_address"] = RTLIB::RETURN_ADDRESS; tlively wrote: How is this handled in the new version? https://github.com/llvm/llvm-project/pull/143054 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CodeGen][NFC] Fix quadratic c-t for large jump tables (PR #144108)
https://github.com/aengelke created https://github.com/llvm/llvm-project/pull/144108 Deleting a basic block removes all references from jump tables, which is O(n). When freeing a MachineFunction, all basic blocks are deleted before the jump tables, causing O(n^2) runtime. Fix this by deallocating the jump table first. Test case generator: import sys n = int(sys.argv[1]) print("define void @f(i64 %c, ptr %p) {") print(" switch i64 %c, label %d [") for i in range(n): print(f"i64 {i}, label %h{i}") print(f" ]") for i in range(n): print(f'h{i}:') print(f' store i64 {i*i}, ptr %p') print(f' ret void') print('d:') print(' ret void') print('}') ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [libc++][C++03] Fix tests which only fail due to incorrect includes (PR #144110)
https://github.com/philnik777 created https://github.com/llvm/llvm-project/pull/144110 Quite a few of the frozen header tests only fail because the include path is incorrect due to copying the headers. This patch fixes the tests where that's the only problem. This is part of https://discourse.llvm.org/t/rfc-freezing-c-03-headers-in-libc. >From 748f899d6b70933aa50f73bbe45ab198b8aacc38 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 13 Jun 2025 18:14:22 +0200 Subject: [PATCH] [libc++][C++03] Fix tests which only fail due to incorrect includes --- .../test/libcxx-03/algorithms/half_positive.pass.cpp | 4 +--- .../assertions/default_verbose_abort.pass.cpp | 4 +--- libcxx/test/libcxx-03/assertions/modes/none.pass.cpp | 4 +--- .../libcxx-03/assertions/single_expression.pass.cpp| 4 +--- .../associative/tree_balance_after_insert.pass.cpp | 4 +--- .../associative/tree_key_value_traits.pass.cpp | 4 +--- .../containers/associative/tree_left_rotate.pass.cpp | 4 +--- .../containers/associative/tree_remove.pass.cpp| 4 +--- .../containers/associative/tree_right_rotate.pass.cpp | 4 +--- .../containers/unord/key_value_traits.pass.cpp | 4 +--- .../libcxx-03/containers/unord/next_prime.pass.cpp | 4 +--- .../libcxx-03/depr/depr.c.headers/extern_c.pass.cpp| 4 +--- .../libcxx-03/iterators/aliasing_iterator.pass.cpp | 4 +--- .../iterators/bounded_iter/arithmetic.pass.cpp | 4 +--- .../iterators/bounded_iter/pointer_traits.pass.cpp | 4 +--- .../iterators/bounded_iter/types.compile.pass.cpp | 4 +--- libcxx/test/libcxx-03/memory/allocation_guard.pass.cpp | 4 +--- libcxx/test/libcxx-03/memory/swap_allocator.pass.cpp | 4 +--- .../test/libcxx-03/numerics/clamp_to_integral.pass.cpp | 4 +--- libcxx/test/libcxx-03/selftest/test_macros.pass.cpp| 4 +--- .../strings/c.strings/constexpr_memmove.pass.cpp | 4 +--- .../is_trivially_comparable.compile.pass.cpp | 8 +++- .../is_trivially_relocatable.compile.pass.cpp | 4 +--- .../libcxx-03/utilities/exception_guard.odr.sh.cpp | 4 +--- 
.../libcxx-03/utilities/is_pointer_in_range.pass.cpp | 4 +--- .../test/libcxx-03/utilities/is_valid_range.pass.cpp | 4 +--- .../test/libcxx-03/utilities/meta/meta_base.pass.cpp | 10 -- libcxx/test/libcxx-03/utilities/no_destroy.pass.cpp| 4 +--- .../utility/private_constructor_tag.compile.pass.cpp | 4 +--- 29 files changed, 34 insertions(+), 92 deletions(-) diff --git a/libcxx/test/libcxx-03/algorithms/half_positive.pass.cpp b/libcxx/test/libcxx-03/algorithms/half_positive.pass.cpp index 88a18e8592921..292fcf356554b 100644 --- a/libcxx/test/libcxx-03/algorithms/half_positive.pass.cpp +++ b/libcxx/test/libcxx-03/algorithms/half_positive.pass.cpp @@ -11,9 +11,7 @@ // __half_positive divides an integer number by 2 as unsigned number for known types. // It can be an important optimization for lower bound, for example. -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - -#include <__algorithm/half_positive.h> +#include <__cxx03/__algorithm/half_positive.h> #include #include #include diff --git a/libcxx/test/libcxx-03/assertions/default_verbose_abort.pass.cpp b/libcxx/test/libcxx-03/assertions/default_verbose_abort.pass.cpp index 803868b757794..27169da5e1c41 100644 --- a/libcxx/test/libcxx-03/assertions/default_verbose_abort.pass.cpp +++ b/libcxx/test/libcxx-03/assertions/default_verbose_abort.pass.cpp @@ -9,9 +9,7 @@ // Test that the default verbose termination function aborts the program. 
// XFAIL: availability-verbose_abort-missing -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - -#include <__verbose_abort> +#include <__cxx03/__verbose_abort> #include #include diff --git a/libcxx/test/libcxx-03/assertions/modes/none.pass.cpp b/libcxx/test/libcxx-03/assertions/modes/none.pass.cpp index b64290a31a129..e79dee906ae69 100644 --- a/libcxx/test/libcxx-03/assertions/modes/none.pass.cpp +++ b/libcxx/test/libcxx-03/assertions/modes/none.pass.cpp @@ -11,9 +11,7 @@ // REQUIRES: libcpp-hardening-mode=none -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - -#include <__assert> +#include <__cxx03/__assert> #include bool executed_condition = false; diff --git a/libcxx/test/libcxx-03/assertions/single_expression.pass.cpp b/libcxx/test/libcxx-03/assertions/single_expression.pass.cpp index 474edc9dc0833..bbda6f11e4f6a 100644 --- a/libcxx/test/libcxx-03/assertions/single_expression.pass.cpp +++ b/libcxx/test/libcxx-03/assertions/single_expression.pass.cpp @@ -10,9 +10,7 @@ // This is useful so we can use them in places that require an expression, such as // in a constructor initializer list. -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - -#include <__assert> +#include <__cxx03/__assert> #include void f() { diff --git a/libcxx/test/libcxx-03/containers/associative/tree_balance_after_insert.pass.cpp b/libcxx/test/libcxx-03/containers/associative/tree_balance_after_insert.pa
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -14944,6 +14945,51 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performPtrAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N1.getOpcode() == ISD::ADD) { +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, +//y is not, and (add y, z) is used only once. +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, +//z is not, and (add y, z) is used only once. +// The goal is to move constant offsets to the outermost ptradd, to create +// more opportunities to fold offsets into memory instructions. +// Together with the generic combines in DAGCombiner.cpp, this also +// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). +// +// This transform is here instead of in the general DAGCombiner as it can +// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for +// AArch64's CPA. +SDValue X = N0; +SDValue Y = N1.getOperand(0); +SDValue Z = N1.getOperand(1); +bool N1OneUse = N1.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); +if ((ZIsConstant != YIsConstant) && N1OneUse) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. + if (N->getFlags().hasNoUnsignedWrap() && + N1->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; ritter-x2a wrote: Done (here and also for similar code in DAGCombiner.cpp). https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang][OpenMP][NFC] Refactor to avoid global variable (PR #144087)
https://github.com/tblah created https://github.com/llvm/llvm-project/pull/144087 Based on top of #144013 I was really hoping this would also work for `hostEvalInfo` but unfortunately that needed to be shared to a greater degree. The same technique should work for that but it would need that class to be made public and then the state kept between calls to `genOpenMP*Construct`, which felt like more trouble than it was worth. I'm open to abandoning this patch if solving one global variable doesn't feel worth this much churn. Making these changes I was wondering if we should implement this file with one big class to wrap up all the state passed to every function. Any thoughts? >From b962af9da5a74b2b5509f654299c3b9c35dca05d Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Fri, 13 Jun 2025 14:58:56 + Subject: [PATCH] [flang][OpenMP][NFC] Refactor to avoid global variable I was really hoping this would also work for `hostEvalInfo` but unfortunately that needed to be shared to a greater degree. The same technique should work for that but it would need that class to be made public and then the state kept between calls to `genOpenMP*Construct`, which felt like more trouble than it was worth. I'm open to abandoning this patch if solving one global variable doesn't feel worth this much churn. Making these changes I was wondering if we should implement this file with one big class to wrap up all the state passed to every function. Any thoughts? 
--- flang/lib/Lower/OpenMP/OpenMP.cpp | 560 +- 1 file changed, 310 insertions(+), 250 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 060eba1b906e3..9c0bfa95f8382 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -48,6 +48,10 @@ using namespace Fortran::common::openmp; static llvm::cl::opt DumpAtomicAnalysis("fdebug-dump-atomic-analysis"); +namespace { +struct OmpLoweringContext; +} // namespace + //===--===// // Code generation helper functions //===--===// @@ -55,6 +59,7 @@ static llvm::cl::opt DumpAtomicAnalysis("fdebug-dump-atomic-analysis"); static void genOMPDispatch(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, + OmpLoweringContext &ompCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item); @@ -191,18 +196,28 @@ class HostEvalInfo { llvm::SmallVector iv; bool loopNestApplied = false, parallelApplied = false; }; -} // namespace /// Stack of \see HostEvalInfo to represent the current nest of \c omp.target /// operations being created. /// /// The current implementation prevents nested 'target' regions from breaking /// the handling of the outer region by keeping a stack of information -/// structures, but it will probably still require some further work to support -/// reverse offloading. -static llvm::SmallVector hostEvalInfo; -static llvm::SmallVector -sectionsStack; +/// structures, but it will probably still require some further work to +/// support reverse offloading. +/// +/// This has to be a global rather than in OmpLoweringContext because different +/// calls to void Fortran::lower::genOpenMPConstruct and +/// Fortran::lower::genOpenMPDeclarativeConstruct need to share the same +/// instance. FIXME: Maybe this should be promoted into the interface for those +/// functions. 
+llvm::SmallVector hostEvalInfo; + +struct OmpLoweringContext { + /// Stack of parse tree information about the sections construct to allow each + /// section to be lowered as part of the enclosing sections construct. + llvm::SmallVector sectionsStack; +}; +} // namespace /// Bind symbols to their corresponding entry block arguments. /// @@ -1151,10 +1166,11 @@ struct OpWithBodyGenInfo { OpWithBodyGenInfo(lower::AbstractConverter &converter, lower::SymMap &symTable, -semantics::SemanticsContext &semaCtx, mlir::Location loc, +semantics::SemanticsContext &semaCtx, +OmpLoweringContext &ompCtx, mlir::Location loc, lower::pft::Evaluation &eval, llvm::omp::Directive dir) - : converter(converter), symTable(symTable), semaCtx(semaCtx), loc(loc), -eval(eval), dir(dir) {} + : converter(converter), symTable(symTable), semaCtx(semaCtx), +ompCtx(ompCtx), loc(loc), eval(eval), dir(dir) {} OpWithBodyGenInfo &setClauses(const List *value) { clauses = value; @@ -1187,6 +1203,8 @@ struct OpWithBodyGenInfo { lower::SymMap &symTable; /// [in] Semantics context semantic
[llvm-branch-commits] [flang] [flang][OpenMP][NFC] Refactor to avoid global variable (PR #144087)
llvmbot wrote: @llvm/pr-subscribers-flang-fir-hlfir Author: Tom Eccles (tblah) Changes Based on top of #144013 I was really hoping this would also work for `hostEvalInfo` but unfortunately that needed to be shared to a greater degree. The same technique should work for that but it would need that class to be made public and then the state kept between calls to `genOpenMP*Construct`, which felt like more trouble than it was worth. I'm open to abandoning this patch if solving one global variable doesn't feel worth this much churn. Making these changes I was wondering if we should implement this file with one big class to wrap up all the state passed to every function. Any thoughts? --- Patch is 75.76 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144087.diff 1 Files Affected: - (modified) flang/lib/Lower/OpenMP/OpenMP.cpp (+310-250) ``diff diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 060eba1b906e3..9c0bfa95f8382 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -48,6 +48,10 @@ using namespace Fortran::common::openmp; static llvm::cl::opt DumpAtomicAnalysis("fdebug-dump-atomic-analysis"); +namespace { +struct OmpLoweringContext; +} // namespace + //===--===// // Code generation helper functions //===--===// @@ -55,6 +59,7 @@ static llvm::cl::opt DumpAtomicAnalysis("fdebug-dump-atomic-analysis"); static void genOMPDispatch(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, + OmpLoweringContext &ompCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item); @@ -191,18 +196,28 @@ class HostEvalInfo { llvm::SmallVector iv; bool loopNestApplied = false, parallelApplied = false; }; -} // namespace /// Stack of \see HostEvalInfo to represent the current nest of \c omp.target /// operations being created. 
/// /// The current implementation prevents nested 'target' regions from breaking /// the handling of the outer region by keeping a stack of information -/// structures, but it will probably still require some further work to support -/// reverse offloading. -static llvm::SmallVector hostEvalInfo; -static llvm::SmallVector -sectionsStack; +/// structures, but it will probably still require some further work to +/// support reverse offloading. +/// +/// This has to be a global rather than in OmpLoweringContext because different +/// calls to void Fortran::lower::genOpenMPConstruct and +/// Fortran::lower::genOpenMPDeclarativeConstruct need to share the same +/// instance. FIXME: Maybe this should be promoted into the interface for those +/// functions. +llvm::SmallVector hostEvalInfo; + +struct OmpLoweringContext { + /// Stack of parse tree information about the sections construct to allow each + /// section to be lowered as part of the enclosing sections construct. + llvm::SmallVector sectionsStack; +}; +} // namespace /// Bind symbols to their corresponding entry block arguments. /// @@ -1151,10 +1166,11 @@ struct OpWithBodyGenInfo { OpWithBodyGenInfo(lower::AbstractConverter &converter, lower::SymMap &symTable, -semantics::SemanticsContext &semaCtx, mlir::Location loc, +semantics::SemanticsContext &semaCtx, +OmpLoweringContext &ompCtx, mlir::Location loc, lower::pft::Evaluation &eval, llvm::omp::Directive dir) - : converter(converter), symTable(symTable), semaCtx(semaCtx), loc(loc), -eval(eval), dir(dir) {} + : converter(converter), symTable(symTable), semaCtx(semaCtx), +ompCtx(ompCtx), loc(loc), eval(eval), dir(dir) {} OpWithBodyGenInfo &setClauses(const List *value) { clauses = value; @@ -1187,6 +1203,8 @@ struct OpWithBodyGenInfo { lower::SymMap &symTable; /// [in] Semantics context semantics::SemanticsContext &semaCtx; + /// [in] OpenMP context + OmpLoweringContext &ompCtx; /// [in] location in source code. 
mlir::Location loc; /// [in] current PFT node/evaluation. @@ -1290,8 +1308,8 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info, if (!info.genSkeletonOnly) { if (ConstructQueue::const_iterator next = std::next(item); next != queue.end()) { - genOMPDispatch(info.converter, info.symTable, info.semaCtx, info.eval, - info.loc, queue, next); + genOMPDispatch(info.converter, info.symTable, info.semaCtx, info.ompCtx, + info.eval, info.loc, queue, next);
[llvm-branch-commits] [llvm] 6bf398b - Revert "[llvm-cov] Add support for baseline coverage (#117910)"
Author: Keith Smiley Date: 2025-06-13T10:04:08-07:00 New Revision: 6bf398b89982b4a47edf48ce2c8c627e8a94ccf9 URL: https://github.com/llvm/llvm-project/commit/6bf398b89982b4a47edf48ce2c8c627e8a94ccf9 DIFF: https://github.com/llvm/llvm-project/commit/6bf398b89982b4a47edf48ce2c8c627e8a94ccf9.diff LOG: Revert "[llvm-cov] Add support for baseline coverage (#117910)" This reverts commit dc9e300f12f3b9c8160dbfb0bc32252ad99c3ba7. Added: Modified: llvm/docs/CommandGuide/llvm-cov.rst llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h llvm/lib/ProfileData/Coverage/CoverageMapping.cpp llvm/tools/llvm-cov/CodeCoverage.cpp llvm/unittests/ProfileData/CoverageMappingTest.cpp Removed: llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test diff --git a/llvm/docs/CommandGuide/llvm-cov.rst b/llvm/docs/CommandGuide/llvm-cov.rst index f4db60cf06fa7..968f3c452f558 100644 --- a/llvm/docs/CommandGuide/llvm-cov.rst +++ b/llvm/docs/CommandGuide/llvm-cov.rst @@ -380,11 +380,6 @@ OPTIONS Fail if an object file cannot be found for a binary ID present in the profile, neither on the command line nor via binary ID lookup. -.. option:: -empty-profile - - Display the baseline coverage of the binaries with all zero execution counts. - Mutually exclusive with -instr-profile. - .. program:: llvm-cov report .. _llvm-cov-report: @@ -475,11 +470,6 @@ OPTIONS Fail if an object file cannot be found for a binary ID present in the profile, neither on the command line nor via binary ID lookup. -.. option:: -empty-profile - - Display the baseline coverage of the binaries with all zero execution counts. - Mutually exclusive with -instr-profile. - .. program:: llvm-cov export .. _llvm-cov-export: @@ -572,11 +562,6 @@ OPTIONS Fail if an object file cannot be found for a binary ID present in the profile, neither on the command line nor via binary ID lookup. -.. option:: -empty-profile - - Export the baseline coverage of the binaries with all zero execution counts. 
- Mutually exclusive with -instr-profile. - CONVERT-FOR-TESTING COMMAND --- diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index d1230b0ba7c58..e62ce5e3d8fa6 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -991,23 +991,18 @@ class CoverageMapping { // Load coverage records from readers. static Error loadFromReaders( ArrayRef> CoverageReaders, - std::optional> - &ProfileReader, - CoverageMapping &Coverage); + IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage); // Load coverage records from file. static Error loadFromFile(StringRef Filename, StringRef Arch, StringRef CompilationDir, - std::optional> - &ProfileReader, - CoverageMapping &Coverage, bool &DataFound, + IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage, + bool &DataFound, SmallVectorImpl *FoundBinaryIDs = nullptr); /// Add a function record corresponding to \p Record. - Error loadFunctionRecord( - const CoverageMappingRecord &Record, - const std::optional> - &ProfileReader); + Error loadFunctionRecord(const CoverageMappingRecord &Record, + IndexedInstrProfReader &ProfileReader); /// Look up the indices for function records which are at least partially /// defined in the specified file. This is guaranteed to return a superset of @@ -1023,16 +1018,15 @@ class CoverageMapping { /// Load the coverage mapping using the given readers. LLVM_ABI static Expected> load(ArrayRef> CoverageReaders, - std::optional> - &ProfileReader); + IndexedInstrProfReader &ProfileReader); /// Load the coverage mapping from the given object files and profile. If /// \p Arches is non-empty, it must specify an architecture for each object. /// Ignores non-instrumented object files unless all are not instrumented. 
LLVM_ABI static Expected> - load(ArrayRef ObjectFilenames, - std::optional ProfileFilename, vfs::FileSystem &FS, - ArrayRef Arches = {}, StringRef CompilationDir = "", + load(ArrayRef ObjectFilenames, StringRef ProfileFilename, + vfs::FileSystem &FS, ArrayRef Arches = {}, + StringRef CompilationDir = "", const object::BuildIDFetcher *BIDFetcher = nullptr, bool CheckBinaryIDs = false); diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 429ec5c19f1f8..dd74eb054a34c 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -823,8 +823,7 @@ class MCDCD
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
https://github.com/newling approved this pull request. LGTM; thanks! https://github.com/llvm/llvm-project/pull/142422 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] 2238fd9 - Revert "[mlir][vector] Fix for WarpOpScfForOp failure when scf.for has result…"
Author: Charitha Saumya Date: 2025-06-13T10:18:24-07:00 New Revision: 2238fd9a756ae1a0b6aa2302e96cc217b08d6c3b URL: https://github.com/llvm/llvm-project/commit/2238fd9a756ae1a0b6aa2302e96cc217b08d6c3b DIFF: https://github.com/llvm/llvm-project/commit/2238fd9a756ae1a0b6aa2302e96cc217b08d6c3b.diff LOG: Revert "[mlir][vector] Fix for WarpOpScfForOp failure when scf.for has result…" This reverts commit 10dc8bc519130f491d70318bd8b47555307cdc3f. Added: Modified: mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp mlir/test/Dialect/Vector/vector-warp-distribute.mlir Removed: diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 52a9cedb43cc0..045c192787f10 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -1554,36 +1554,22 @@ struct WarpOpScfForOp : public WarpDistributionPattern { llvm::SmallSetVector escapingValues; SmallVector inputTypes; SmallVector distTypes; -auto collectEscapingValues = [&](Value value) { - if (!escapingValues.insert(value)) -return; - Type distType = value.getType(); - if (auto vecType = dyn_cast(distType)) { -AffineMap map = distributionMapFn(value); -distType = getDistributedType(vecType, map, warpOp.getWarpSize()); - } - inputTypes.push_back(value.getType()); - distTypes.push_back(distType); -}; - mlir::visitUsedValuesDefinedAbove( forOp.getBodyRegion(), [&](OpOperand *operand) { Operation *parent = operand->get().getParentRegion()->getParentOp(); if (warpOp->isAncestor(parent)) { -collectEscapingValues(operand->get()); +if (!escapingValues.insert(operand->get())) + return; +Type distType = operand->get().getType(); +if (auto vecType = dyn_cast(distType)) { + AffineMap map = distributionMapFn(operand->get()); + distType = getDistributedType(vecType, map, warpOp.getWarpSize()); +} +inputTypes.push_back(operand->get().getType()); +distTypes.push_back(distType); } }); -// Any forOp 
result that is not already yielded by the warpOp -// region is also considered escaping and must be returned by the -// original warpOp. -for (OpResult forResult : forOp.getResults()) { - // Check if this forResult is already yielded by the yield op. - if (llvm::is_contained(yield->getOperands(), forResult)) -continue; - collectEscapingValues(forResult); -} - if (llvm::is_contained(distTypes, Type{})) return failure(); @@ -1623,12 +1609,7 @@ struct WarpOpScfForOp : public WarpDistributionPattern { forOp.getResultTypes().end()); llvm::SmallDenseMap argIndexMapping; for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) { - auto newWarpResult = newWarpOp.getResult(retIdx); - // Unused forOp results yielded by the warpOp region are already included - // in the new ForOp. - if (llvm::is_contained(newOperands, newWarpResult)) -continue; - warpInput.push_back(newWarpResult); + warpInput.push_back(newWarpOp.getResult(retIdx)); argIndexMapping[escapingValues[i]] = warpInputType.size(); warpInputType.push_back(inputTypes[i]); } diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index 6c7ac7a5196a7..38771f2593449 100644 --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -584,42 +584,6 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref, %arg2 return } -// - -// CHECK-PROP-LABEL: func.func @warp_scf_for_unused_yield( -// CHECK-PROP: %[[W0:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[INI0:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: gpu.yield %[[INI0]], %[[INI1]] : vector<128xf32>, vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1) -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[W1:.*]]:2 = 
gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[ACC0:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>, index) -> vector<128xf32> -// CHECK-PROP: %[[ACC1:.*]] = "some_def"(%{{.*}}) : (index, vector<128xf32>, vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: gpu.yield %[[ACC1]], %[[ACC0]] : vector<12
[llvm-branch-commits] [mlir] [MLIR] Fix incorrect slice contiguity inference in `vector::isContiguousSlice` (PR #142422)
@@ -83,16 +84,48 @@ func.func @transfer_read_dims_mismatch_contiguous( return %res : vector<1x1x2x2xi8> } -// CHECK-LABEL: func.func @transfer_read_dims_mismatch_contiguous( +// CHECK-LABEL: func.func @transfer_read_dims_mismatch_contiguous_unit_dims( // CHECK-SAME: %[[MEM:.*]]: memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>) -> vector<1x1x2x2xi8> { // CHECK: %[[VAL_1:.*]] = arith.constant 0 : i8 // CHECK: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]] = memref.collapse_shape %[[MEM]] {{\[\[}}0, 1, 2, 3]] : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>> into memref<120xi8, strided<[1], offset: ?>> -// CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_3]]{{\[}}%[[VAL_2]]], %[[VAL_1]] {in_bounds = [true]} : memref<120xi8, strided<[1], offset: ?>>, vector<4xi8> +// CHECK: %[[VAL_3:.*]] = memref.collapse_shape %[[MEM]] +// CHECK-SAME{LITERAL}: [[0], [1], [2, 3]] +// CHECK-SAME:: memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>> into memref<5x4x6xi8, strided<[24, 6, 1], offset: ?>> +// CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_3]][%[[VAL_2]], %[[VAL_2]], %[[VAL_2]]], %[[VAL_1]] {in_bounds = [true]} : memref<5x4x6xi8, strided<[24, 6, 1], offset: ?>>, vector<4xi8> // CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<4xi8> to vector<1x1x2x2xi8> // CHECK: return %[[VAL_5]] : vector<1x1x2x2xi8> -// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_contiguous( +// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_contiguous_unit_dims( +// CHECK-128B: memref.collapse_shape + +// - + +// The shape of the memref and the vector don't match, but the vector is a +// contiguous subset of the memref, so "flattenable" + +func.func @transfer_read_dims_mismatch_contiguous_non_unit_dims( +%mem : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>) -> vector<2x3x2xi8> { + + %c0 = arith.constant 0 : index + %cst = arith.constant 0 : i8 + %res = vector.transfer_read %mem[%c0, %c0, %c0, %c0], %cst : +memref<5x4x3x2xi8, 
strided<[24, 6, 2, 1], offset: ?>>, vector<2x3x2xi8> + return %res : vector<2x3x2xi8> +} + +// CHECK-LABEL: func.func @transfer_read_dims_mismatch_contiguous_non_unit_dims( +// CHECK-SAME:%[[MEM:.+]]: memref<5x4x3x2xi8, {{.+}}>) -> vector<2x3x2xi8> { +// CHECK: %[[C0_I8:.+]] = arith.constant 0 : i8 +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[COLLAPSED_MEM:.+]] = memref.collapse_shape %[[MEM]] +// CHECK-SAME{LITERAL}: [[0], [1, 2, 3]] +// CHECK-SAME: : memref<5x4x3x2xi8, {{.+}}> into memref<5x24xi8, {{.+}}> +// CHECK: %[[VEC_1D:.+]] = vector.transfer_read %[[COLLAPSED_MEM]][%[[C0]], %[[C0]]], %[[C0_I8]] {in_bounds = [true]} +// CHECK-SAME: : memref<5x24xi8, strided<[24, 1], offset: ?>>, vector<12xi8> +// CHECK: %[[VEC:.+]] = vector.shape_cast %[[VEC_1D]] : vector<12xi8> to vector<2x3x2xi8> +// CHECK: return %[[VEC]] : vector<2x3x2xi8> momchil-velikov wrote: I don't understand the rationale behind having these in a particular order. https://github.com/llvm/llvm-project/pull/142422 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -2628,6 +2630,87 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0) && PtrVT == IntVT) +return N1; ritter-x2a wrote: I've applied the suggested change for now. https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] llvm-lto2: Add print-guid subcommand. (PR #143992)
https://github.com/teresajohnson commented: This needs a caveat somewhere (either in printed usage message or in a comment) that this won't work for local linkage symbols (I suppose the user could give the "file:" prefix but that won't work if -funique-internal-linkage-names was specified etc). Can you also add a test? https://github.com/llvm/llvm-project/pull/143992 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns (PR #143881)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143881 >From 46090a8031fde937a76268ce7adbbdc6f42911ad Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 12 Jun 2025 07:44:37 -0400 Subject: [PATCH] [AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns This patch mirrors similar patterns for ISD::ADD. The main difference is that ISD::ADD is commutative, so that a pattern definition for, e.g., (add (mul x, y), z), automatically also handles (add z, (mul x, y)). ISD::PTRADD is not commutative, so we would need to handle these cases explicitly. This patch only implements (ptradd z, (op x, y)) patterns, where the nested operation (shift or multiply) is the offset of the ptradd (i.e., the right operand), since base pointers that are the result of a shift or multiply seem less likely. For SWDEV-516125. --- llvm/lib/Target/AMDGPU/VOP3Instructions.td| 36 +++- .../AMDGPU/ptradd-sdag-optimizations.ll | 41 ++ llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll | 42 +++ 3 files changed, 52 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index a005e0245b8ff..8054e75782539 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -484,12 +484,13 @@ let OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue defm: Ternary_i16_Pats_gfx9; } // End OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts -class ThreeOpFragSDAG : PatFrag< +class ThreeOpFragSDAG : PatFrag< (ops node:$x, node:$y, node:$z), // When the inner operation is used multiple times, selecting 3-op // instructions may still be beneficial -- if the other users can be // combined similarly. Let's be conservative for now. 
- (op2 (HasOneUseBinOp node:$x, node:$y), node:$z), + !if(op1IsRight, (op2 node:$z, (HasOneUseBinOp node:$x, node:$y)), + (op2 (HasOneUseBinOp node:$x, node:$y), node:$z)), [{ // Only use VALU ops when the result is divergent. if (!N->isDivergent()) @@ -516,7 +517,10 @@ class ThreeOpFragSDAG : PatFrag< let PredicateCodeUsesOperands = 1; } -class ThreeOpFrag : ThreeOpFragSDAG { +// Matches (op2 (op1 x, y), z) if op1IsRight = 0 and +// matches (op2 z, (op1, x, y)) if op1IsRight = 1. +class ThreeOpFrag : ThreeOpFragSDAG { // The divergence predicate is irrelevant in GlobalISel, as we have // proper register bank checks. We just need to verify the constant // bus restriction when all the sources are considered. @@ -806,12 +810,19 @@ def : GCNPat< (DivergentBinFrag i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; -let SubtargetPredicate = isGFX940Plus in +let SubtargetPredicate = isGFX940Plus in { def : GCNPat< (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) >; +def : GCNPat < + // (ptradd z, (shl x, y)) -> ((x << y) + z) + (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), + (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) +>; +} // End SubtargetPredicate = isGFX940Plus + def : VOPBinOpClampPat; def : VOPBinOpClampPat; @@ -880,19 +891,24 @@ multiclass IMAD32_Pats { // Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul. // We need to separate this because otherwise OtherPredicates would be overriden. 
-class IMAD32_Mul24_Pat: GCNPat < -(i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)), -(inst $src0, $src1, $src2, 0 /* clamp */) ->; +class IMAD32_Mul24_Pats_Impl : GCNPat < +!if(mulIsRight, (i64 (AddOp i64:$src2, (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1, +(i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2))), +(inst $src0, $src1, $src2, 0 /* clamp */)>; + +multiclass IMAD32_Mul24_Pats { + def : IMAD32_Mul24_Pats_Impl; + def : IMAD32_Mul24_Pats_Impl; +} // exclude pre-GFX9 where it was slow let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index d48bfe0bb7f21..34bb98550de04 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -266,18 +266,11 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { ; Use non-zero shift amounts in v_lshl_add_u64. define ptr @select_v_lshl_add_u64(ptr %base,
[llvm-branch-commits] [libcxx] [libc++][C++03] Remove XFAILs from the non-frozen libc++-specific tests (PR #144101)
github-actions[bot] wrote: :warning: Python code formatter, darker found issues in your code. :warning: You can test this locally with the following command: ``bash darker --check --diff -r HEAD~1...HEAD libcxx/test/libcxx/clang_modules_include.gen.py libcxx/test/libcxx/clang_tidy.gen.py libcxx/test/libcxx/header_inclusions.gen.py libcxx/test/libcxx/system_reserved_names.gen.py libcxx/test/libcxx/transitive_includes.gen.py `` View the diff from darker here. ``diff --- clang_tidy.gen.py 2025-06-13 15:49:01.00 + +++ clang_tidy.gen.py 2025-06-13 15:53:41.838897 + @@ -17,11 +17,12 @@ import sys sys.path.append(sys.argv[1]) from libcxx.header_information import lit_header_restrictions, lit_header_undeprecations, public_headers for header in public_headers: - print(f"""\ +print( +f"""\ //--- {header}.sh.cpp // REQUIRES: has-clang-tidy // The GCC compiler flags are not always compatible with clang-tidy. @@ -32,6 +33,7 @@ // TODO: run clang-tidy with modules enabled once they are supported // RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- -Wweak-vtables %{{compile_flags}} -fno-modules #include <{header}> -""") +""" +) `` https://github.com/llvm/llvm-project/pull/144101 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] llvm-lto2: Add print-guid subcommand. (PR #143992)
https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/143992 >From f11d7d544cc61dce582de538608bfd512147f90a Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 12 Jun 2025 16:06:14 -0700 Subject: [PATCH 1/2] Upload correct patch Created using spr 1.3.6-beta.1 --- llvm/tools/llvm-lto2/llvm-lto2.cpp | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index 5f5c954c6a57d..d35868ffafe1e 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -610,7 +610,9 @@ int main(int argc, char **argv) { return dumpSymtab(argc - 1, argv + 1); if (Subcommand == "run") return run(argc - 1, argv + 1); - if (Subcommand == "print-guid" && argc > 2) -outs() << GlobalValue::getGUIDAssumingExternalLinkage(argv[2]); + if (Subcommand == "print-guid" && argc > 2) { +outs() << GlobalValue::getGUIDAssumingExternalLinkage(argv[2]) << '\n'; +return 0; + } return usage(); } >From c7cb16abb3c30e54a12ec1b9ce325d49cf37d2bc Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 13 Jun 2025 14:34:28 -0700 Subject: [PATCH 2/2] Add comment Created using spr 1.3.6-beta.1 --- llvm/tools/llvm-lto2/llvm-lto2.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index d35868ffafe1e..fbde6a596 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -611,6 +611,8 @@ int main(int argc, char **argv) { if (Subcommand == "run") return run(argc - 1, argv + 1); if (Subcommand == "print-guid" && argc > 2) { +// Note the name of the function we're calling: this won't return the right +// answer for internal linkage symbols. outs() << GlobalValue::getGUIDAssumingExternalLinkage(argv[2]) << '\n'; return 0; } ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] llvm-lto2: Add print-guid subcommand. (PR #143992)
pcc wrote: > At least a comment in the code would be good. A variety of people end up > using these tools for tests, and I could see someone getting confused as to > why the guid doesn't match what's e.g. in the ThinLTO index. For that > understanding you'd have to read more than just what this source file is > calling. I added a comment. https://github.com/llvm/llvm-project/pull/143992 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [MSAN] handle assorted AVX permutations (PR #143462)
https://github.com/fmayer updated https://github.com/llvm/llvm-project/pull/143462 >From e7f58f76d921bdf3e7f4a585a25a2612d66fee33 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 13 Jun 2025 15:14:20 -0700 Subject: [PATCH] assert Created using spr 1.3.4 --- .../Instrumentation/MemorySanitizer.cpp | 8 ++ .../X86/avx512vl-intrinsics.ll| 73 --- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 2ede88d0f0b37..fb55bd7bfe567 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4174,6 +4174,14 @@ struct MemorySanitizerVisitor : public InstVisitor { // Instrument AVX permutation intrinsic. // We apply the same permutation (argument index 1) to the shadow. void handleAVXPermutation(IntrinsicInst &I) { +assert(I.arg_size() == 2); +assert(isa(I.getArgOperand(0)->getType())); +assert(isa(I.getArgOperand(1)->getType())); +[[maybe_unused]] auto ArgVectorSize = +cast(I.getArgOperand(0)->getType())->getNumElements(); +assert(cast(I.getArgOperand(1)->getType()) + ->getNumElements() == ArgVectorSize); +assert(I.getType() == I.getArgOperand(0)->getType()); IRBuilder<> IRB(&I); Value *Shadow = getShadow(&I, 0); insertShadowCheck(I.getArgOperand(1), &I); diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll index 1a067ec67d218..40b5e9338e45e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll @@ -8633,18 +8633,18 @@ define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i6 ; CHECK-NEXT:[[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT:[[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT:call void @llvm.donothing() -; CHECK-NEXT:[[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256 -; CHECK-NEXT:[[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT:[[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to <4 x double> +; CHECK-NEXT:[[TMP6:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[TMP3]], <4 x i64> [[X1]]) +; CHECK-NEXT:[[TMP7:%.*]] = bitcast <4 x double> [[TMP6]] to <4 x i64> ; CHECK-NEXT:[[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT:[[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT:[[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT:br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] -; CHECK: [[BB5]]: +; CHECK-NEXT:br i1 [[_MSCMP1]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: ; CHECK-NEXT:call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT:unreachable -; CHECK: [[BB6]]: +; CHECK: [[BB8]]: ; CHECK-NEXT:[[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]]) -; CHECK-NEXT:store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT:store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT:ret <4 x double> [[TMP1]] ; %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) @@ -8660,26 +8660,26 @@ define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 ; CHECK-NEXT:[[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 ; CHECK-NEXT:[[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT:call void @llvm.donothing() -; CHECK-NEXT:[[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to i256 -; CHECK-NEXT:[[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT:[[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to <4 x double> +; 
CHECK-NEXT:[[TMP16:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[TMP14]], <4 x i64> [[X1]]) +; CHECK-NEXT:[[TMP18:%.*]] = bitcast <4 x double> [[TMP16]] to <4 x i64> ; CHECK-NEXT:[[TMP15:%.*]] = bitcast <4 x i64> [[TMP11]] to i256 ; CHECK-NEXT:[[_MSCMP1:%.*]] = icmp ne i256 [[TMP15]], 0 -; CHECK-NEXT:[[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT:br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] -; CHECK: [[BB7]]: +; CHECK-NEXT:br i1 [[_MSCMP1]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT:call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT:unreachable -; CHECK:
[llvm-branch-commits] [MSAN] handle assorted AVX permutations (PR #143462)
https://github.com/fmayer updated https://github.com/llvm/llvm-project/pull/143462 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [MSAN] handle assorted AVX permutations (PR #143462)
https://github.com/fmayer updated https://github.com/llvm/llvm-project/pull/143462 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [MSAN] handle assorted AVX permutations (PR #143462)
fmayer wrote: ready for review https://github.com/llvm/llvm-project/pull/143462 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [MSAN] handle assorted AVX permutations (PR #143462)
https://github.com/thurstond approved this pull request. https://github.com/llvm/llvm-project/pull/143462 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [MSAN] handle AVX vpermi2var (PR #143463)
https://github.com/thurstond approved this pull request. https://github.com/llvm/llvm-project/pull/143463 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [MSAN] handle AVX vpermi2var (PR #143463)
@@ -4191,6 +4191,15 @@ struct MemorySanitizerVisitor : public InstVisitor { // We apply the same permutation (argument index 1) to the shadows. void handleAVXVpermil2var(IntrinsicInst &I) { assert(I.arg_size() == 3); +assert(isa(I.getArgOperand(0)->getType())); +assert(isa(I.getArgOperand(1)->getType())); +assert(isa(I.getArgOperand(2)->getType())); +[[maybe_unused]] auto ArgVectorSize = +cast(I.getArgOperand(0)->getType())->getNumElements(); +assert(cast(I.getArgOperand(1)->getType()) + ->getNumElements() == ArgVectorSize); +assert(cast(I.getArgOperand(2)->getType()) + ->getNumElements() == ArgVectorSize); thurstond wrote: Some of the assertions are redundant (e.g., if operand 0 is a vector, and operand 0's type is the same as operand 2's type, then operand 2 must be a vector with the same number of elements as operand 0), but that's fine. https://github.com/llvm/llvm-project/pull/143463 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libc] 9a2e40b - Revert "Turn LIBC_COPT_STRING_UNSAFE_WIDE_READ on by default (#144163)"
Author: Amy Huang Date: 2025-06-13T15:29:51-07:00 New Revision: 9a2e40b9eea4297631c2462a345d1cbc8d01f373 URL: https://github.com/llvm/llvm-project/commit/9a2e40b9eea4297631c2462a345d1cbc8d01f373 DIFF: https://github.com/llvm/llvm-project/commit/9a2e40b9eea4297631c2462a345d1cbc8d01f373.diff LOG: Revert "Turn LIBC_COPT_STRING_UNSAFE_WIDE_READ on by default (#144163)" This reverts commit a591bd222b2e0356b8132b515422fe480b87322b. Added: Modified: libc/config/config.json Removed: diff --git a/libc/config/config.json b/libc/config/config.json index 0354b16997cdd..d53b2936edb07 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -59,7 +59,7 @@ }, "string": { "LIBC_CONF_STRING_UNSAFE_WIDE_READ": { - "value": true, + "value": false, "doc": "Read more than a byte at a time to perform byte-string operations like strlen." }, "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": { ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CodeGen][NFC] Fix quadratic c-t for large jump tables (PR #144108)
https://github.com/aeubanks approved this pull request. makes sense. can you put some compile time numbers in the description before and after this patch? https://github.com/llvm/llvm-project/pull/144108 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [MSAN] handle AVX vpermi2var (PR #143463)
https://github.com/fmayer closed https://github.com/llvm/llvm-project/pull/143463 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] WebAssembly: Stop directly using RuntimeLibcalls.def (PR #143054)
@@ -528,23 +528,20 @@ RuntimeLibcallSignatureTable &getRuntimeLibcallSignatures() { // constructor for use with a static variable struct StaticLibcallNameMap { StringMap Map; - StaticLibcallNameMap() { -static const std::pair NameLibcalls[] = { -#define HANDLE_LIBCALL(code, name) {(const char *)name, RTLIB::code}, -#include "llvm/IR/RuntimeLibcalls.def" -#undef HANDLE_LIBCALL -}; -for (const auto &NameLibcall : NameLibcalls) { - if (NameLibcall.first != nullptr && - getRuntimeLibcallSignatures().Table[NameLibcall.second] != - unsupported) { -assert(!Map.contains(NameLibcall.first) && + StaticLibcallNameMap(const Triple &TT) { +// FIXME: This is broken if there are ever different triples compiled with +// different libcalls. +RTLIB::RuntimeLibcallsInfo RTCI(TT); +for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) { + RTLIB::Libcall LC = static_cast(I); + const char *NameLibcall = RTCI.getLibcallName(LC); + if (NameLibcall != nullptr && + getRuntimeLibcallSignatures().Table[LC] != unsupported) { +assert(!Map.contains(NameLibcall) && "duplicate libcall names in name map"); -Map[NameLibcall.first] = NameLibcall.second; +Map[NameLibcall] = LC; } } - -Map["emscripten_return_address"] = RTLIB::RETURN_ADDRESS; arsenm wrote: RuntimeLibcallsInfo directly sets this, this was already moved in the parent PR https://github.com/llvm/llvm-project/pull/143054 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] llvm-lto2: Add print-guid subcommand. (PR #143992)
teresajohnson wrote: > > This needs a caveat somewhere (either in printed usage message or in a > > comment) that this won't work for local linkage symbols (I suppose the user > > could give the "file:" prefix but that won't work if > > -funique-internal-linkage-names was specified etc). > > I'm not sure that is worth it. The intent is that users of these development > tools will refer to the source code. And if you read the source code you'll > see the function name `getGUIDAssumingExternalLinkage` which tells you what > you need to know. At least a comment in the code would be good. A variety of people end up using these tools for tests, and I could see someone getting confused as to why the guid doesn't match what's e.g. in the ThinLTO index. For that understanding you'd have to read more than just what this source file is calling. https://github.com/llvm/llvm-project/pull/143992 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] llvm-lto2: Add print-guid subcommand. (PR #143992)
https://github.com/teresajohnson approved this pull request. lgtm otherwise https://github.com/llvm/llvm-project/pull/143992 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] llvm-lto2: Add print-guid subcommand. (PR #143992)
pcc wrote: > This needs a caveat somewhere (either in printed usage message or in a > comment) that this won't work for local linkage symbols (I suppose the user > could give the "file:" prefix but that won't work if > -funique-internal-linkage-names was specified etc). I'm not sure that is worth it. The intent is that users of these development tools will refer to the source code. And if you read the source code you'll see the function name `getGUIDAssumingExternalLinkage` which tells you what you need to know. > Can you also add a test? Done. https://github.com/llvm/llvm-project/pull/143992 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] llvm-lto2: Add print-guid subcommand. (PR #143992)
https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/143992 >From f11d7d544cc61dce582de538608bfd512147f90a Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 12 Jun 2025 16:06:14 -0700 Subject: [PATCH] Upload correct patch Created using spr 1.3.6-beta.1 --- llvm/tools/llvm-lto2/llvm-lto2.cpp | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index 5f5c954c6a57d..d35868ffafe1e 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -610,7 +610,9 @@ int main(int argc, char **argv) { return dumpSymtab(argc - 1, argv + 1); if (Subcommand == "run") return run(argc - 1, argv + 1); - if (Subcommand == "print-guid" && argc > 2) -outs() << GlobalValue::getGUIDAssumingExternalLinkage(argv[2]); + if (Subcommand == "print-guid" && argc > 2) { +outs() << GlobalValue::getGUIDAssumingExternalLinkage(argv[2]) << '\n'; +return 0; + } return usage(); } ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] llvm-lto2: Add print-guid subcommand. (PR #143992)
https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/143992 >From f11d7d544cc61dce582de538608bfd512147f90a Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 12 Jun 2025 16:06:14 -0700 Subject: [PATCH] Upload correct patch Created using spr 1.3.6-beta.1 --- llvm/tools/llvm-lto2/llvm-lto2.cpp | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index 5f5c954c6a57d..d35868ffafe1e 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -610,7 +610,9 @@ int main(int argc, char **argv) { return dumpSymtab(argc - 1, argv + 1); if (Subcommand == "run") return run(argc - 1, argv + 1); - if (Subcommand == "print-guid" && argc > 2) -outs() << GlobalValue::getGUIDAssumingExternalLinkage(argv[2]); + if (Subcommand == "print-guid" && argc > 2) { +outs() << GlobalValue::getGUIDAssumingExternalLinkage(argv[2]) << '\n'; +return 0; + } return usage(); } ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI] Test all projects when CI scripts change (PR #144034)
https://github.com/boomanaiden154 created https://github.com/llvm/llvm-project/pull/144034 This patch resolves a fixme in the compute_projects script to actually test all the projects we can when touching something in the .ci directory. This ensures we test things like compiler-rt before landing changes. ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns (PR #143881)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143881 >From f93590bac710750f993c86005c217b843cc5a863 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 12 Jun 2025 07:44:37 -0400 Subject: [PATCH] [AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns This patch mirrors similar patterns for ISD::ADD. The main difference is that ISD::ADD is commutative, so that a pattern definition for, e.g., (add (mul x, y), z), automatically also handles (add z, (mul x, y)). ISD::PTRADD is not commutative, so we would need to handle these cases explicitly. This patch only implements (ptradd z, (op x, y)) patterns, where the nested operation (shift or multiply) is the offset of the ptradd (i.e., the right operand), since base pointers that are the result of a shift or multiply seem less likely. For SWDEV-516125. --- llvm/lib/Target/AMDGPU/VOP3Instructions.td| 36 +++- .../AMDGPU/ptradd-sdag-optimizations.ll | 41 ++ llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll | 42 +++ 3 files changed, 52 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index a005e0245b8ff..8054e75782539 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -484,12 +484,13 @@ let OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue defm: Ternary_i16_Pats_gfx9; } // End OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts -class ThreeOpFragSDAG : PatFrag< +class ThreeOpFragSDAG : PatFrag< (ops node:$x, node:$y, node:$z), // When the inner operation is used multiple times, selecting 3-op // instructions may still be beneficial -- if the other users can be // combined similarly. Let's be conservative for now. 
- (op2 (HasOneUseBinOp node:$x, node:$y), node:$z), + !if(op1IsRight, (op2 node:$z, (HasOneUseBinOp node:$x, node:$y)), + (op2 (HasOneUseBinOp node:$x, node:$y), node:$z)), [{ // Only use VALU ops when the result is divergent. if (!N->isDivergent()) @@ -516,7 +517,10 @@ class ThreeOpFragSDAG : PatFrag< let PredicateCodeUsesOperands = 1; } -class ThreeOpFrag : ThreeOpFragSDAG { +// Matches (op2 (op1 x, y), z) if op1IsRight = 0 and +// matches (op2 z, (op1, x, y)) if op1IsRight = 1. +class ThreeOpFrag : ThreeOpFragSDAG { // The divergence predicate is irrelevant in GlobalISel, as we have // proper register bank checks. We just need to verify the constant // bus restriction when all the sources are considered. @@ -806,12 +810,19 @@ def : GCNPat< (DivergentBinFrag i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; -let SubtargetPredicate = isGFX940Plus in +let SubtargetPredicate = isGFX940Plus in { def : GCNPat< (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) >; +def : GCNPat < + // (ptradd z, (shl x, y)) -> ((x << y) + z) + (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), + (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) +>; +} // End SubtargetPredicate = isGFX940Plus + def : VOPBinOpClampPat; def : VOPBinOpClampPat; @@ -880,19 +891,24 @@ multiclass IMAD32_Pats { // Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul. // We need to separate this because otherwise OtherPredicates would be overriden. 
-class IMAD32_Mul24_Pat: GCNPat < -(i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)), -(inst $src0, $src1, $src2, 0 /* clamp */) ->; +class IMAD32_Mul24_Pats_Impl : GCNPat < +!if(mulIsRight, (i64 (AddOp i64:$src2, (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1, +(i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2))), +(inst $src0, $src1, $src2, 0 /* clamp */)>; + +multiclass IMAD32_Mul24_Pats { + def : IMAD32_Mul24_Pats_Impl; + def : IMAD32_Mul24_Pats_Impl; +} // exclude pre-GFX9 where it was slow let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index d48bfe0bb7f21..34bb98550de04 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -266,18 +266,11 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { ; Use non-zero shift amounts in v_lshl_add_u64. define ptr @select_v_lshl_add_u64(ptr %base,
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis (PR #142778)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142778 >From c959592b27205064e3b6f53c7330032bce84f857 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 09:48:02 -0400 Subject: [PATCH] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis This is used in a bunch of memory-related transforms. For SWDEV-516125. --- .../SelectionDAGAddressAnalysis.cpp | 6 ++-- .../AMDGPU/ptradd-sdag-optimizations.ll | 28 ++- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index f2ab88851b780..da92aaa860b2b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -231,6 +231,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } break; case ISD::ADD: +case ISD::PTRADD: if (auto *C = dyn_cast(Base->getOperand(1))) { Offset += C->getSExtValue(); Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0)); @@ -259,7 +260,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, break; } - if (Base->getOpcode() == ISD::ADD) { + if (Base->isAnyAdd()) { // TODO: The following code appears to be needless as it just // bails on some Ptrs early, reducing the cases where we // find equivalence. We should be able to remove this. 
@@ -282,8 +283,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } // Check if Index Offset pattern -if (Index->getOpcode() != ISD::ADD || -!isa(Index->getOperand(1))) +if (!Index->isAnyAdd() || !isa(Index->getOperand(1))) return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt); Offset += cast(Index->getOperand(1))->getSExtValue(); diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index d3242905ada64..2e76033a480f4 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -130,26 +130,14 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; Taken from memcpy-param-combinations.ll, tests PTRADD handling in ; SelectionDAGAddressAnalysis. define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { -; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +; GFX942-LABEL: memcpy_p1_p4_sz16_align_1_1: +; 
GFX942: ; %bb.0: ; %entry +; GFX942-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) ret void ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines (PR #143673)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143673 >From a3d204e9a8aae5de008a83904215d44d8d0c3380 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 11 Jun 2025 05:48:45 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine. For SWDEV-516125. --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 151 ++ .../AMDGPU/ptradd-sdag-optimizations.ll | 151 ++ 3 files changed, 167 insertions(+), 139 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 45a37622a531b..1210777428020 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6706,7 +6706,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, return SDValue(); int64_t Offset = C2->getSExtValue(); switch (Opcode) { - case ISD::ADD: break; + case ISD::ADD: + case ISD::PTRADD: +break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 184984abcdf32..fe002b3daed89 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -14329,7 +14331,7 @@ static SDValue 
tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14362,7 +14364,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14474,8 +14476,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); +unsigned Opcode = N->getOpcode(); +if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = -DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); +DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -14949,44 +14954,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, -//y is not, and (add y, z) is used only once. -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, -//z is not, and (add y, z) is used only once. -// The goal is to move constant offsets to the outermost ptradd, to create -// more opportunities to fold offsets into memory instructions. 
-// Together with the generic combines in DAGCombiner.cpp, this also -// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). -// -// This transform is here instead of in the general DAGCombiner as it can -// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for -// AArch64's CPA. -SDValue X = N0; -SDValue Y = N1.getOperand(0); -SDValue Z = N1.getOperand(1); -bool N1OneUse = N1.hasOneUse(); -bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); -bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); -if ((ZIsConstant != YIsConstant) && N1OneUse) { - SDNodeFlags Flags; - // If both additions in the original we
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Tests for target-specific ISD::PTRADD combines (PR #143672)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143672 >From c9cbbce907dc77f1580019bb78ae3c175f99af37 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 11 Jun 2025 05:14:34 -0400 Subject: [PATCH] [AMDGPU][SDAG] Tests for target-specific ISD::PTRADD combines Pre-committing tests to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 176 ++ 1 file changed, 176 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 2e76033a480f4..1ec94162951a6 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -142,3 +142,179 @@ entry: tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) ret void } + +; Test skipping the lower-32-bit addition if it is unnecessary. +define ptr @huge_offset_low_32_unused(ptr %p) { +; GFX942_PTRADD-LABEL: huge_offset_low_32_unused: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_mov_b32 s0, 0 +; GFX942_PTRADD-NEXT:s_mov_b32 s1, 1 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: huge_offset_low_32_unused: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_add_u32_e32 v1, 1, v1 +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep = getelementptr inbounds i8, ptr %p, i64 u0x1 + ret ptr %gep +} + +; Reassociate address computation if it leads to more scalar operations. 
+define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { +; GFX942_PTRADD-LABEL: reassoc_scalar_r: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_PTRADD-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7] +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_PTRADD-NEXT:s_endpgm +; +; GFX942_LEGACY-LABEL: reassoc_scalar_r: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_LEGACY-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_LEGACY-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_LEGACY-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_LEGACY-NEXT:s_add_u32 s2, s2, s6 +; GFX942_LEGACY-NEXT:s_addc_u32 s3, s3, s7 +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_LEGACY-NEXT:s_endpgm +entry: + %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() + %voffset = zext i32 %voffset32 to i64 + %offset = add nuw nsw i64 %voffset, %soffset + %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset + store ptr addrspace(1) %gep, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { +; GFX942_PTRADD-LABEL: reassoc_scalar_l: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_PTRADD-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1] 
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_PTRADD-NEXT:s_endpgm +; +; GFX942_LEGACY-LABEL: reassoc_scalar_l: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_LEGACY-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_LEGACY-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_LEGACY-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_LEGACY-NEXT:s_add_u32 s2, s2, s6 +; GFX942_LEGACY-NEXT:s_addc_u32 s3, s3, s7 +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_LEGACY-NEXT:s_endpgm +entry: + %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() + %voffset = zext i32 %voffset32 to i64 + %offset = add nuw nsw i64 %soffset, %voffset + %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset + store ptr addrspace(1) %gep, ptr addrspace
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142738 >From cdea7dfe63d04d4b2879d7f73408753ff70e20dc Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 3 Jun 2025 09:49:19 -0400 Subject: [PATCH 1/2] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines Pre-committing tests to show improvements in a follow-up PR with the combines. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 207 ++ 1 file changed, 207 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll new file mode 100644 index 0..0241be9197e1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s + +; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG +; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable +; similar transformations in that pass. + +; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use. 
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_ZTwoUses: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_ZTwoUses: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24 + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset + %l = load i64, ptr addrspace(1) %gep1, align 8 + %r = add i64 %l, %voffset + ret i64 %r +} + +define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt 
vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %add0 = add nuw nsw i64 %voffset, 24 + %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 + %l = load i64, ptr addrspace(1) %gep0, align 8 + ret i64 %l +} + +; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These +; would be folded away in most cases, but the index computation introduced by +; the legalization of wide vector stores can for example introduce them. +define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { +; GFX942_PTRADD-LABEL: store_v16i32: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23 +; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16 +; GFX942_PTRADD-NEXT:v_mo
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines (PR #142738)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142738 >From cdea7dfe63d04d4b2879d7f73408753ff70e20dc Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 3 Jun 2025 09:49:19 -0400 Subject: [PATCH 1/2] [AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines Pre-committing tests to show improvements in a follow-up PR with the combines. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 207 ++ 1 file changed, 207 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll new file mode 100644 index 0..0241be9197e1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s + +; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG +; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable +; similar transformations in that pass. + +; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use. 
+define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_ZTwoUses: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_ZTwoUses: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24 + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset + %l = load i64, ptr addrspace(1) %gep1, align 8 + %r = add i64 %l, %voffset + ret i64 %r +} + +define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[2:3], 0, 24 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[0:1], v[0:1], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942_LEGACY-NEXT:s_waitcnt 
vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %add0 = add nuw nsw i64 %voffset, 24 + %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 + %l = load i64, ptr addrspace(1) %gep0, align 8 + ret i64 %l +} + +; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These +; would be folded away in most cases, but the index computation introduced by +; the legalization of wide vector stores can for example introduce them. +define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { +; GFX942_PTRADD-LABEL: store_v16i32: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v4, 0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_add_u32 s2, s0, 32 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s20 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, s21 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v2, s22 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v3, s23 +; GFX942_PTRADD-NEXT:s_addc_u32 s3, s1, 0 +; GFX942_PTRADD-NEXT:global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX942_PTRADD-NEXT:s_nop 1 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v0, s16 +; GFX942_PTRADD-NEXT:v_mo
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142739 >From 743ecdf0cf69d300859d6817fa4a9c48218aa9e5 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 03:32:32 -0400 Subject: [PATCH 1/5] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 92 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + .../AMDGPU/ptradd-sdag-optimizations.ll | 194 ++ 4 files changed, 201 insertions(+), 135 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5d62ded171f4f..505cb264ae948 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -419,6 +419,7 @@ namespace { SDValue visitADDLike(SDNode *N); SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); +SDValue visitPTRADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDSAT(SDNode *N); SDValue visitSUBSAT(SDNode *N); @@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return true; } - if (Opc != ISD::ADD) + if (Opc != ISD::ADD && Opc != ISD::PTRADD) return false; auto *C2 = dyn_cast(N1); @@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::TokenFactor:return 
visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD:return visitADD(N); + case ISD::PTRADD: return visitPTRADD(N); case ISD::SUB:return visitSUB(N); case ISD::SADDSAT: case ISD::UADDSAT:return visitADDSAT(N); @@ -2628,6 +2630,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. +if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. 
+ if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; + SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); + AddToWorklist(Add.getNode()); + return DAG.getMemBasePlusOffset(X, Add, DL, Flags); +} + +// TODO: There is another possible fold here that was proven useful. +// It would be this: +// +// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if: +// * (ptradd x, y) has one use; and +// * y is a constant; and +// * z is not a constant. +// +// In some cases, specifically in AArch64's FEAT_CPA, it exposes the +// opportunity to select more complex instructions such as SUBPT and +// MSUBPT. H
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142739 >From 743ecdf0cf69d300859d6817fa4a9c48218aa9e5 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 03:32:32 -0400 Subject: [PATCH 1/5] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 92 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + .../AMDGPU/ptradd-sdag-optimizations.ll | 194 ++ 4 files changed, 201 insertions(+), 135 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5d62ded171f4f..505cb264ae948 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -419,6 +419,7 @@ namespace { SDValue visitADDLike(SDNode *N); SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); +SDValue visitPTRADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDSAT(SDNode *N); SDValue visitSUBSAT(SDNode *N); @@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return true; } - if (Opc != ISD::ADD) + if (Opc != ISD::ADD && Opc != ISD::PTRADD) return false; auto *C2 = dyn_cast(N1); @@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::TokenFactor:return 
visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD:return visitADD(N); + case ISD::PTRADD: return visitPTRADD(N); case ISD::SUB:return visitSUB(N); case ISD::SADDSAT: case ISD::UADDSAT:return visitADDSAT(N); @@ -2628,6 +2630,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. +if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. 
+ if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; + SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); + AddToWorklist(Add.getNode()); + return DAG.getMemBasePlusOffset(X, Add, DL, Flags); +} + +// TODO: There is another possible fold here that was proven useful. +// It would be this: +// +// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if: +// * (ptradd x, y) has one use; and +// * y is a constant; and +// * z is not a constant. +// +// In some cases, specifically in AArch64's FEAT_CPA, it exposes the +// opportunity to select more complex instructions such as SUBPT and +// MSUBPT. H
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Test ISD::PTRADD handling in VOP3 patterns (PR #143880)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143880 >From 99d65b3e0a8627b581673b55505962665a3ffcb6 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 12 Jun 2025 06:13:26 -0400 Subject: [PATCH] [AMDGPU][SDAG] Test ISD::PTRADD handling in VOP3 patterns Pre-committing tests to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 45 +++ 1 file changed, 45 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index c00bccdbce6b7..d48bfe0bb7f21 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -263,3 +263,48 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { store float 1.0, ptr addrspace(1) %p1 ret void } + +; Use non-zero shift amounts in v_lshl_add_u64. +define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: select_v_lshl_add_u64: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: select_v_lshl_add_u64: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep = getelementptr inbounds i64, ptr %base, i64 %voffset + ret ptr %gep +} + +; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the +; mul into a mul24. 
+define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) { +; GFX942_PTRADD-LABEL: fold_mul24_into_mad: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_and_b32_e32 v2, 0xf, v2 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v4, 0xf, v4 +; GFX942_PTRADD-NEXT:v_mul_hi_u32_u24_e32 v3, v2, v4 +; GFX942_PTRADD-NEXT:v_mul_u32_u24_e32 v2, v2, v4 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: fold_mul24_into_mad: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_and_b32_e32 v2, 0xf, v2 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v3, 0xf, v4 +; GFX942_LEGACY-NEXT:v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %a_masked = and i64 %a, u0xf + %b_masked = and i64 %b, u0xf + %mul = mul i64 %a_masked, %b_masked + %gep = getelementptr inbounds i8, ptr %base, i64 %mul + ret ptr %gep +} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns (PR #143881)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143881 >From f93590bac710750f993c86005c217b843cc5a863 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 12 Jun 2025 07:44:37 -0400 Subject: [PATCH] [AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns This patch mirrors similar patterns for ISD::ADD. The main difference is that ISD::ADD is commutative, so that a pattern definition for, e.g., (add (mul x, y), z), automatically also handles (add z, (mul x, y)). ISD::PTRADD is not commutative, so we would need to handle these cases explicitly. This patch only implements (ptradd z, (op x, y)) patterns, where the nested operation (shift or multiply) is the offset of the ptradd (i.e., the right operand), since base pointers that are the result of a shift or multiply seem less likely. For SWDEV-516125. --- llvm/lib/Target/AMDGPU/VOP3Instructions.td| 36 +++- .../AMDGPU/ptradd-sdag-optimizations.ll | 41 ++ llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll | 42 +++ 3 files changed, 52 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index a005e0245b8ff..8054e75782539 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -484,12 +484,13 @@ let OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue defm: Ternary_i16_Pats_gfx9; } // End OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts -class ThreeOpFragSDAG : PatFrag< +class ThreeOpFragSDAG : PatFrag< (ops node:$x, node:$y, node:$z), // When the inner operation is used multiple times, selecting 3-op // instructions may still be beneficial -- if the other users can be // combined similarly. Let's be conservative for now. 
- (op2 (HasOneUseBinOp node:$x, node:$y), node:$z), + !if(op1IsRight, (op2 node:$z, (HasOneUseBinOp node:$x, node:$y)), + (op2 (HasOneUseBinOp node:$x, node:$y), node:$z)), [{ // Only use VALU ops when the result is divergent. if (!N->isDivergent()) @@ -516,7 +517,10 @@ class ThreeOpFragSDAG : PatFrag< let PredicateCodeUsesOperands = 1; } -class ThreeOpFrag : ThreeOpFragSDAG { +// Matches (op2 (op1 x, y), z) if op1IsRight = 0 and +// matches (op2 z, (op1, x, y)) if op1IsRight = 1. +class ThreeOpFrag : ThreeOpFragSDAG { // The divergence predicate is irrelevant in GlobalISel, as we have // proper register bank checks. We just need to verify the constant // bus restriction when all the sources are considered. @@ -806,12 +810,19 @@ def : GCNPat< (DivergentBinFrag i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; -let SubtargetPredicate = isGFX940Plus in +let SubtargetPredicate = isGFX940Plus in { def : GCNPat< (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) >; +def : GCNPat < + // (ptradd z, (shl x, y)) -> ((x << y) + z) + (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), + (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) +>; +} // End SubtargetPredicate = isGFX940Plus + def : VOPBinOpClampPat; def : VOPBinOpClampPat; @@ -880,19 +891,24 @@ multiclass IMAD32_Pats { // Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul. // We need to separate this because otherwise OtherPredicates would be overriden. 
-class IMAD32_Mul24_Pat: GCNPat < -(i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)), -(inst $src0, $src1, $src2, 0 /* clamp */) ->; +class IMAD32_Mul24_Pats_Impl : GCNPat < +!if(mulIsRight, (i64 (AddOp i64:$src2, (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1, +(i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2))), +(inst $src0, $src1, $src2, 0 /* clamp */)>; + +multiclass IMAD32_Mul24_Pats { + def : IMAD32_Mul24_Pats_Impl; + def : IMAD32_Mul24_Pats_Impl; +} // exclude pre-GFX9 where it was slow let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index d48bfe0bb7f21..34bb98550de04 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -266,18 +266,11 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { ; Use non-zero shift amounts in v_lshl_add_u64. define ptr @select_v_lshl_add_u64(ptr %base,
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142777 >From df620d738a35bb2d52c4254a784b66431725206f Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 09:30:34 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis Pre-committing test to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 28 +++ 1 file changed, 28 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index b78dea1684545..d3242905ada64 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -126,3 +126,31 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { store volatile i64 %dispatch.id, ptr addrspace(1) %ptr ret void } + +; Taken from memcpy-param-combinations.ll, tests PTRADD handling in +; SelectionDAGAddressAnalysis. 
+define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { +; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) + ret void +} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis (PR #142778)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142778 >From c959592b27205064e3b6f53c7330032bce84f857 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 09:48:02 -0400 Subject: [PATCH] [AMDGPU][SDAG] Handle ISD::PTRADD in SelectionDAGAddressAnalysis This is used in a bunch of memory-related transforms. For SWDEV-516125. --- .../SelectionDAGAddressAnalysis.cpp | 6 ++-- .../AMDGPU/ptradd-sdag-optimizations.ll | 28 ++- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index f2ab88851b780..da92aaa860b2b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -231,6 +231,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } break; case ISD::ADD: +case ISD::PTRADD: if (auto *C = dyn_cast(Base->getOperand(1))) { Offset += C->getSExtValue(); Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0)); @@ -259,7 +260,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, break; } - if (Base->getOpcode() == ISD::ADD) { + if (Base->isAnyAdd()) { // TODO: The following code appears to be needless as it just // bails on some Ptrs early, reducing the cases where we // find equivalence. We should be able to remove this. 
@@ -282,8 +283,7 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N, } // Check if Index Offset pattern -if (Index->getOpcode() != ISD::ADD || -!isa(Index->getOperand(1))) +if (!Index->isAnyAdd() || !isa(Index->getOperand(1))) return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt); Offset += cast(Index->getOperand(1))->getSExtValue(); diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index d3242905ada64..2e76033a480f4 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -130,26 +130,14 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; Taken from memcpy-param-combinations.ll, tests PTRADD handling in ; SelectionDAGAddressAnalysis. define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { -; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off -; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 -; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off -; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +; GFX942-LABEL: memcpy_p1_p4_sz16_align_1_1: +; 
GFX942: ; %bb.0: ; %entry +; GFX942-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942-NEXT:s_waitcnt vmcnt(0) +; GFX942-NEXT:s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) ret void ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142777 >From df620d738a35bb2d52c4254a784b66431725206f Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 09:30:34 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis Pre-committing test to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 28 +++ 1 file changed, 28 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index b78dea1684545..d3242905ada64 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -126,3 +126,31 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { store volatile i64 %dispatch.id, ptr addrspace(1) %ptr ret void } + +; Taken from memcpy-param-combinations.ll, tests PTRADD handling in +; SelectionDAGAddressAnalysis. 
+define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { +; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) + ret void +} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Tests for target-specific ISD::PTRADD combines (PR #143672)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143672 >From c9cbbce907dc77f1580019bb78ae3c175f99af37 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 11 Jun 2025 05:14:34 -0400 Subject: [PATCH] [AMDGPU][SDAG] Tests for target-specific ISD::PTRADD combines Pre-committing tests to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 176 ++ 1 file changed, 176 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 2e76033a480f4..1ec94162951a6 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -142,3 +142,179 @@ entry: tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) ret void } + +; Test skipping the lower-32-bit addition if it is unnecessary. +define ptr @huge_offset_low_32_unused(ptr %p) { +; GFX942_PTRADD-LABEL: huge_offset_low_32_unused: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_mov_b32 s0, 0 +; GFX942_PTRADD-NEXT:s_mov_b32 s1, 1 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: huge_offset_low_32_unused: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_add_u32_e32 v1, 1, v1 +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep = getelementptr inbounds i8, ptr %p, i64 u0x1 + ret ptr %gep +} + +; Reassociate address computation if it leads to more scalar operations. 
+define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { +; GFX942_PTRADD-LABEL: reassoc_scalar_r: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_PTRADD-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7] +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_PTRADD-NEXT:s_endpgm +; +; GFX942_LEGACY-LABEL: reassoc_scalar_r: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_LEGACY-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_LEGACY-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_LEGACY-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_LEGACY-NEXT:s_add_u32 s2, s2, s6 +; GFX942_LEGACY-NEXT:s_addc_u32 s3, s3, s7 +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_LEGACY-NEXT:s_endpgm +entry: + %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() + %voffset = zext i32 %voffset32 to i64 + %offset = add nuw nsw i64 %voffset, %soffset + %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset + store ptr addrspace(1) %gep, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { +; GFX942_PTRADD-LABEL: reassoc_scalar_l: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_PTRADD-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1] 
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_PTRADD-NEXT:s_endpgm +; +; GFX942_LEGACY-LABEL: reassoc_scalar_l: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_LEGACY-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_LEGACY-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_LEGACY-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_LEGACY-NEXT:s_add_u32 s2, s2, s6 +; GFX942_LEGACY-NEXT:s_addc_u32 s3, s3, s7 +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_LEGACY-NEXT:s_endpgm +entry: + %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() + %voffset = zext i32 %voffset32 to i64 + %offset = add nuw nsw i64 %soffset, %voffset + %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset + store ptr addrspace(1) %gep, ptr addrspace
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines (PR #143673)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143673 >From a3d204e9a8aae5de008a83904215d44d8d0c3380 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 11 Jun 2025 05:48:45 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine. For SWDEV-516125. --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 151 ++ .../AMDGPU/ptradd-sdag-optimizations.ll | 151 ++ 3 files changed, 167 insertions(+), 139 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 45a37622a531b..1210777428020 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6706,7 +6706,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, return SDValue(); int64_t Offset = C2->getSExtValue(); switch (Opcode) { - case ISD::ADD: break; + case ISD::ADD: + case ISD::PTRADD: +break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 184984abcdf32..fe002b3daed89 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -14329,7 +14331,7 @@ static SDValue 
tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14362,7 +14364,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14474,8 +14476,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); +unsigned Opcode = N->getOpcode(); +if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = -DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); +DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -14949,44 +14954,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, -//y is not, and (add y, z) is used only once. -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, -//z is not, and (add y, z) is used only once. -// The goal is to move constant offsets to the outermost ptradd, to create -// more opportunities to fold offsets into memory instructions. 
-// Together with the generic combines in DAGCombiner.cpp, this also -// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). -// -// This transform is here instead of in the general DAGCombiner as it can -// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for -// AArch64's CPA. -SDValue X = N0; -SDValue Y = N1.getOperand(0); -SDValue Z = N1.getOperand(1); -bool N1OneUse = N1.hasOneUse(); -bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); -bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); -if ((ZIsConstant != YIsConstant) && N1OneUse) { - SDNodeFlags Flags; - // If both additions in the original we
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Test ISD::PTRADD handling in VOP3 patterns (PR #143880)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143880 >From 99d65b3e0a8627b581673b55505962665a3ffcb6 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 12 Jun 2025 06:13:26 -0400 Subject: [PATCH] [AMDGPU][SDAG] Test ISD::PTRADD handling in VOP3 patterns Pre-committing tests to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 45 +++ 1 file changed, 45 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index c00bccdbce6b7..d48bfe0bb7f21 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -263,3 +263,48 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { store float 1.0, ptr addrspace(1) %p1 ret void } + +; Use non-zero shift amounts in v_lshl_add_u64. +define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: select_v_lshl_add_u64: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: select_v_lshl_add_u64: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep = getelementptr inbounds i64, ptr %base, i64 %voffset + ret ptr %gep +} + +; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the +; mul into a mul24. 
+define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) { +; GFX942_PTRADD-LABEL: fold_mul24_into_mad: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_and_b32_e32 v2, 0xf, v2 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v4, 0xf, v4 +; GFX942_PTRADD-NEXT:v_mul_hi_u32_u24_e32 v3, v2, v4 +; GFX942_PTRADD-NEXT:v_mul_u32_u24_e32 v2, v2, v4 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: fold_mul24_into_mad: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_and_b32_e32 v2, 0xf, v2 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v3, 0xf, v4 +; GFX942_LEGACY-NEXT:v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1] +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %a_masked = and i64 %a, u0xf + %b_masked = and i64 %b, u0xf + %mul = mul i64 %a_masked, %b_masked + %gep = getelementptr inbounds i8, ptr %base, i64 %mul + ret ptr %gep +} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [RISCV] Support non-power-of-2 types when expanding memcmp (PR #114971)
@@ -2954,20 +2954,13 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { } if (IsZeroCmp && ST->hasVInstructions()) { -unsigned RealMinVLen = ST->getRealMinVLen(); -// Support Fractional LMULs if the lengths are larger than XLen. -// TODO: Support non-power-of-2 types. -for (unsigned FLMUL = 8; FLMUL >= 2; FLMUL /= 2) { - unsigned Len = RealMinVLen / FLMUL; - if (Len > ST->getXLen()) -Options.LoadSizes.insert(Options.LoadSizes.begin(), Len / 8); -} -for (unsigned LMUL = 1; LMUL <= ST->getMaxLMULForFixedLengthVectors(); - LMUL *= 2) { - unsigned Len = RealMinVLen * LMUL; - if (Len > ST->getXLen()) -Options.LoadSizes.insert(Options.LoadSizes.begin(), Len / 8); -} +unsigned VLenB = ST->getRealMinVLen() / 8; +// The minimum size should be the maximum bytes between `VLen * LMUL_MF8` +// and `XLen * 2`. +unsigned MinSize = std::max(VLenB / 8, ST->getXLen() * 2 / 8); lukel97 wrote: If that's the case, do we even need the LMUL check? I.e. can we just do ``` unsigned MinSize = ST->getXLen() + 1; ``` And presumably for sizes < MF8, lowering will use the correct container anyway? https://github.com/llvm/llvm-project/pull/114971 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -2628,6 +2630,87 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0) && PtrVT == IntVT) +return N1; arsenm wrote: ```suggestion // fold (ptradd 0, x) -> x if (PtrVT == IntVT && isNullConstant(N0)) return N1; ``` But PtrVT == IntVT was already asserted above? https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -14944,6 +14945,51 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performPtrAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N1.getOpcode() == ISD::ADD) { +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, +//y is not, and (add y, z) is used only once. +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, +//z is not, and (add y, z) is used only once. +// The goal is to move constant offsets to the outermost ptradd, to create +// more opportunities to fold offsets into memory instructions. +// Together with the generic combines in DAGCombiner.cpp, this also +// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). +// +// This transform is here instead of in the general DAGCombiner as it can +// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for +// AArch64's CPA. +SDValue X = N0; +SDValue Y = N1.getOperand(0); +SDValue Z = N1.getOperand(1); +bool N1OneUse = N1.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); +if ((ZIsConstant != YIsConstant) && N1OneUse) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. + if (N->getFlags().hasNoUnsignedWrap() && + N1->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; arsenm wrote: Can you do SDNodeFlags = (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap? https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
@@ -14944,6 +14945,51 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performPtrAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N1.getOpcode() == ISD::ADD) { +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, +//y is not, and (add y, z) is used only once. +// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, +//z is not, and (add y, z) is used only once. +// The goal is to move constant offsets to the outermost ptradd, to create +// more opportunities to fold offsets into memory instructions. +// Together with the generic combines in DAGCombiner.cpp, this also +// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). +// +// This transform is here instead of in the general DAGCombiner as it can +// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for +// AArch64's CPA. +SDValue X = N0; +SDValue Y = N1.getOperand(0); +SDValue Z = N1.getOperand(1); +bool N1OneUse = N1.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); +if ((ZIsConstant != YIsConstant) && N1OneUse) { arsenm wrote: Avoid the DAG.isConstantIntBuildVectorOrConstantInt in the !N1OneUse case? https://github.com/llvm/llvm-project/pull/142739 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/142777 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines (PR #143673)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143673 >From 50de6e085242ce975af812088f4ef48896444fb6 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 11 Jun 2025 05:48:45 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine. For SWDEV-516125. --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 151 ++ .../AMDGPU/ptradd-sdag-optimizations.ll | 151 ++ 3 files changed, 167 insertions(+), 139 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 45a37622a531b..1210777428020 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6706,7 +6706,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, return SDValue(); int64_t Offset = C2->getSExtValue(); switch (Opcode) { - case ISD::ADD: break; + case ISD::ADD: + case ISD::PTRADD: +break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 184984abcdf32..fe002b3daed89 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -14329,7 +14331,7 @@ static SDValue 
tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14362,7 +14364,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14474,8 +14476,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); +unsigned Opcode = N->getOpcode(); +if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = -DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); +DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -14949,44 +14954,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, -//y is not, and (add y, z) is used only once. -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, -//z is not, and (add y, z) is used only once. -// The goal is to move constant offsets to the outermost ptradd, to create -// more opportunities to fold offsets into memory instructions. 
-// Together with the generic combines in DAGCombiner.cpp, this also -// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). -// -// This transform is here instead of in the general DAGCombiner as it can -// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for -// AArch64's CPA. -SDValue X = N0; -SDValue Y = N1.getOperand(0); -SDValue Z = N1.getOperand(1); -bool N1OneUse = N1.hasOneUse(); -bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); -bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); -if ((ZIsConstant != YIsConstant) && N1OneUse) { - SDNodeFlags Flags; - // If both additions in the original we
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142739 >From 3002da1befde734af1904d3424abd72b65f1377b Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 03:32:32 -0400 Subject: [PATCH 1/5] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 92 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + .../AMDGPU/ptradd-sdag-optimizations.ll | 194 ++ 4 files changed, 201 insertions(+), 135 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5d62ded171f4f..505cb264ae948 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -419,6 +419,7 @@ namespace { SDValue visitADDLike(SDNode *N); SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); +SDValue visitPTRADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDSAT(SDNode *N); SDValue visitSUBSAT(SDNode *N); @@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return true; } - if (Opc != ISD::ADD) + if (Opc != ISD::ADD && Opc != ISD::PTRADD) return false; auto *C2 = dyn_cast(N1); @@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::TokenFactor:return 
visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD:return visitADD(N); + case ISD::PTRADD: return visitPTRADD(N); case ISD::SUB:return visitSUB(N); case ISD::SADDSAT: case ISD::UADDSAT:return visitADDSAT(N); @@ -2628,6 +2630,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. +if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. 
+ if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; + SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); + AddToWorklist(Add.getNode()); + return DAG.getMemBasePlusOffset(X, Add, DL, Flags); +} + +// TODO: There is another possible fold here that was proven useful. +// It would be this: +// +// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if: +// * (ptradd x, y) has one use; and +// * y is a constant; and +// * z is not a constant. +// +// In some cases, specifically in AArch64's FEAT_CPA, it exposes the +// opportunity to select more complex instructions such as SUBPT and +// MSUBPT. H
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142739 >From 3002da1befde734af1904d3424abd72b65f1377b Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 03:32:32 -0400 Subject: [PATCH 1/5] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition: - `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially, - `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU. For SWDEV-516125. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 92 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + .../AMDGPU/ptradd-sdag-optimizations.ll | 194 ++ 4 files changed, 201 insertions(+), 135 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5d62ded171f4f..505cb264ae948 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -419,6 +419,7 @@ namespace { SDValue visitADDLike(SDNode *N); SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); +SDValue visitPTRADD(SDNode *N); SDValue visitSUB(SDNode *N); SDValue visitADDSAT(SDNode *N); SDValue visitSUBSAT(SDNode *N); @@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return true; } - if (Opc != ISD::ADD) + if (Opc != ISD::ADD && Opc != ISD::PTRADD) return false; auto *C2 = dyn_cast(N1); @@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::TokenFactor:return 
visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD:return visitADD(N); + case ISD::PTRADD: return visitPTRADD(N); case ISD::SUB:return visitSUB(N); case ISD::SADDSAT: case ISD::UADDSAT:return visitADDSAT(N); @@ -2628,6 +2630,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { return SDValue(); } +/// Try to fold a pointer arithmetic node. +/// This needs to be done separately from normal addition, because pointer +/// addition is not commutative. +SDValue DAGCombiner::visitPTRADD(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT PtrVT = N0.getValueType(); + EVT IntVT = N1.getValueType(); + SDLoc DL(N); + + // This is already ensured by an assert in SelectionDAG::getNode(). Several + // combines here depend on this assumption. + assert(PtrVT == IntVT && + "PTRADD with different operand types is not supported"); + + // fold (ptradd undef, y) -> undef + if (N0.isUndef()) +return N0; + + // fold (ptradd x, undef) -> undef + if (N1.isUndef()) +return DAG.getUNDEF(PtrVT); + + // fold (ptradd x, 0) -> x + if (isNullConstant(N1)) +return N0; + + // fold (ptradd 0, x) -> x + if (isNullConstant(N0)) +return N1; + + if (N0.getOpcode() == ISD::PTRADD && + !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) { +SDValue X = N0.getOperand(0); +SDValue Y = N0.getOperand(1); +SDValue Z = N1; +bool N0OneUse = N0.hasOneUse(); +bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); +bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + +// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if: +// * y is a constant and (ptradd x, y) has one use; or +// * y and z are both constants. +if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, the new ones are as well. 
+ if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) +Flags |= SDNodeFlags::NoUnsignedWrap; + SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); + AddToWorklist(Add.getNode()); + return DAG.getMemBasePlusOffset(X, Add, DL, Flags); +} + +// TODO: There is another possible fold here that was proven useful. +// It would be this: +// +// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if: +// * (ptradd x, y) has one use; and +// * y is a constant; and +// * z is not a constant. +// +// In some cases, specifically in AArch64's FEAT_CPA, it exposes the +// opportunity to select more complex instructions such as SUBPT and +// MSUBPT. H
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis (PR #142777)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142777 >From c0eab936e1cab87636ae7c676d7232948cc35aef Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 4 Jun 2025 09:30:34 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add test for ISD::PTRADD handling in SelectionDAGAddressAnalysis Pre-committing test to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 28 +++ 1 file changed, 28 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index b78dea1684545..d3242905ada64 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -126,3 +126,31 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { store volatile i64 %dispatch.id, ptr addrspace(1) %ptr ret void } + +; Taken from memcpy-param-combinations.ll, tests PTRADD handling in +; SelectionDAGAddressAnalysis. 
+define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { +; GFX942_PTRADD-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[4:5], v[2:3], off +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[4:5], off +; GFX942_PTRADD-NEXT:global_load_dwordx2 v[2:3], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:global_store_dwordx2 v[0:1], v[2:3], off offset:8 +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: memcpy_p1_p4_sz16_align_1_1: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:global_load_dwordx4 v[2:5], v[2:3], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:global_store_dwordx4 v[0:1], v[2:5], off +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) + ret void +} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Tests for target-specific ISD::PTRADD combines (PR #143672)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143672 >From 37747657c81cc49feb345810b792f01e35d28511 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 11 Jun 2025 05:14:34 -0400 Subject: [PATCH] [AMDGPU][SDAG] Tests for target-specific ISD::PTRADD combines Pre-committing tests to show improvements in a follow-up PR. --- .../AMDGPU/ptradd-sdag-optimizations.ll | 176 ++ 1 file changed, 176 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 2e76033a480f4..1ec94162951a6 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -142,3 +142,179 @@ entry: tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) ret void } + +; Test skipping the lower-32-bit addition if it is unnecessary. +define ptr @huge_offset_low_32_unused(ptr %p) { +; GFX942_PTRADD-LABEL: huge_offset_low_32_unused: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT:s_mov_b32 s0, 0 +; GFX942_PTRADD-NEXT:s_mov_b32 s1, 1 +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942_PTRADD-NEXT:s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: huge_offset_low_32_unused: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT:v_add_u32_e32 v1, 1, v1 +; GFX942_LEGACY-NEXT:s_setpc_b64 s[30:31] + %gep = getelementptr inbounds i8, ptr %p, i64 u0x1 + ret ptr %gep +} + +; Reassociate address computation if it leads to more scalar operations. 
+define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { +; GFX942_PTRADD-LABEL: reassoc_scalar_r: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_PTRADD-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7] +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_PTRADD-NEXT:s_endpgm +; +; GFX942_LEGACY-LABEL: reassoc_scalar_r: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_LEGACY-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_LEGACY-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_LEGACY-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_LEGACY-NEXT:s_add_u32 s2, s2, s6 +; GFX942_LEGACY-NEXT:s_addc_u32 s3, s3, s7 +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_LEGACY-NEXT:s_endpgm +entry: + %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() + %voffset = zext i32 %voffset32 to i64 + %offset = add nuw nsw i64 %voffset, %soffset + %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset + store ptr addrspace(1) %gep, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { +; GFX942_PTRADD-LABEL: reassoc_scalar_l: +; GFX942_PTRADD: ; %bb.0: ; %entry +; GFX942_PTRADD-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_PTRADD-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_PTRADD-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_PTRADD-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_PTRADD-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1] 
+; GFX942_PTRADD-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] +; GFX942_PTRADD-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_PTRADD-NEXT:s_endpgm +; +; GFX942_LEGACY-LABEL: reassoc_scalar_l: +; GFX942_LEGACY: ; %bb.0: ; %entry +; GFX942_LEGACY-NEXT:s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942_LEGACY-NEXT:s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942_LEGACY-NEXT:v_mov_b32_e32 v1, 0 +; GFX942_LEGACY-NEXT:v_and_b32_e32 v0, 0x3ff, v0 +; GFX942_LEGACY-NEXT:s_waitcnt lgkmcnt(0) +; GFX942_LEGACY-NEXT:s_add_u32 s2, s2, s6 +; GFX942_LEGACY-NEXT:s_addc_u32 s3, s3, s7 +; GFX942_LEGACY-NEXT:v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942_LEGACY-NEXT:global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942_LEGACY-NEXT:s_endpgm +entry: + %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() + %voffset = zext i32 %voffset32 to i64 + %offset = add nuw nsw i64 %soffset, %voffset + %gep = getelementptr i8, ptr addrspace(1) %p, i64 %offset + store ptr addrspace(1) %gep, ptr addrspace
[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines (PR #143673)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143673 >From 50de6e085242ce975af812088f4ef48896444fb6 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 11 Jun 2025 05:48:45 -0400 Subject: [PATCH] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine. For SWDEV-516125. --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 151 ++ .../AMDGPU/ptradd-sdag-optimizations.ll | 151 ++ 3 files changed, 167 insertions(+), 139 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 45a37622a531b..1210777428020 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6706,7 +6706,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, return SDValue(); int64_t Offset = C2->getSExtValue(); switch (Opcode) { - case ISD::ADD: break; + case ISD::ADD: + case ISD::PTRADD: +break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 184984abcdf32..fe002b3daed89 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -14329,7 +14331,7 @@ static SDValue 
tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14362,7 +14364,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14474,8 +14476,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); +unsigned Opcode = N->getOpcode(); +if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = -DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); +DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -14949,44 +14954,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, -//y is not, and (add y, z) is used only once. -// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, -//z is not, and (add y, z) is used only once. -// The goal is to move constant offsets to the outermost ptradd, to create -// more opportunities to fold offsets into memory instructions. 
-// Together with the generic combines in DAGCombiner.cpp, this also -// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). -// -// This transform is here instead of in the general DAGCombiner as it can -// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for -// AArch64's CPA. -SDValue X = N0; -SDValue Y = N1.getOperand(0); -SDValue Z = N1.getOperand(1); -bool N1OneUse = N1.hasOneUse(); -bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); -bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); -if ((ZIsConstant != YIsConstant) && N1OneUse) { - SDNodeFlags Flags; - // If both additions in the original we