[llvm-branch-commits] [llvm] AMDGPU: Custom expand flat cmpxchg which may access private (PR #109410)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/109410 >From 831b4a6dde281d7cd3b95557c15cb417d278d568 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 14 Aug 2024 13:57:14 +0400 Subject: [PATCH 1/2] AMDGPU: Custom expand flat cmpxchg which may access private 64-bit flat cmpxchg instructions do not work correctly for scratch addresses, and need to be expanded as non-atomic. Allow custom expansion of cmpxchg in AtomicExpand, as is already the case for atomicrmw. --- llvm/include/llvm/CodeGen/TargetLowering.h|5 + .../llvm/Transforms/Utils/LowerAtomic.h |7 + llvm/lib/CodeGen/AtomicExpandPass.cpp |4 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 146 ++- llvm/lib/Target/AMDGPU/SIISelLowering.h |3 + llvm/lib/Transforms/Utils/LowerAtomic.cpp | 21 +- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 1027 +++-- ...expand-atomicrmw-flat-noalias-addrspace.ll |6 +- ...expand-atomicrmw-integer-ops-0-to-add-0.ll |6 +- .../expand-cmpxchg-flat-maybe-private.ll | 104 +- 10 files changed, 1161 insertions(+), 168 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 3842af56e6b3d7..678b169568afcf 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2204,6 +2204,11 @@ class TargetLoweringBase { "Generic atomicrmw expansion unimplemented on this target"); } + /// Perform a cmpxchg expansion using a target-specific method. + virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const { +llvm_unreachable("Generic cmpxchg expansion unimplemented on this target"); + } + /// Perform a bit test atomicrmw using a target-specific intrinsic. This /// represents the combined bit test intrinsic which will be lowered at a late /// stage by the backend. diff --git a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h index b25b281667f9cb..295c2bd2b4b47e 100644 --- a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h +++ b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h @@ -23,6 +23,13 @@ class IRBuilderBase; /// Convert the given Cmpxchg into primitive load and compare. bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI); +/// Emit IR to implement the given cmpxchg operation on values in registers, +/// returning the new value. +std::pair<Value *, Value *> buildAtomicCmpXchgValue(IRBuilderBase &Builder, +Value *Ptr, Value *Cmp, +Value *Val, +Align Alignment); + /// Convert the given RMWI into primitive load and stores, /// assuming that doing so is legal. Return true if the lowering /// succeeds. 
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index b5eca44cb611a3..71e0fd2b7167a2 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1672,6 +1672,10 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { return true; case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicCmpXchgInst(CI); + case TargetLoweringBase::AtomicExpansionKind::Expand: { +TLI->emitExpandAtomicCmpXchg(CI); +return true; + } } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a4962399fc2815..17bd5c2343a4f1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16504,9 +16504,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { - return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS - ? AtomicExpansionKind::NotAtomic - : AtomicExpansionKind::None; + unsigned AddrSpace = CmpX->getPointerAddressSpace(); + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) +return AtomicExpansionKind::NotAtomic; + + if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX)) +return AtomicExpansionKind::None; + + const DataLayout &DL = CmpX->getDataLayout(); + + Type *ValTy = CmpX->getNewValOperand()->getType(); + + // If a 64-bit flat atomic may alias private, we need to avoid using the + // atomic in the private case. + return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand + : AtomicExpansionKind::None; } const TargetRegisterClass * @@ -16670,40 +16682,8 @@ bool SITargetLowering::checkForPhysRegDependency( return false; } -void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { - AtomicRMWInst::BinOp Op = AI->getOperation(); - - if (Op == AtomicRMWInst::Sub || Op ==
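To make the shape of the custom expansion concrete, here is a rough sketch of the kind of IR it is meant to produce for a flat cmpxchg that may alias private memory — block and value names are illustrative, not taken from the patch:

```llvm
; Sketch only: guard the atomic with an is.private check and fall back to a
; plain load/compare/store sequence on the scratch path.
define { i64, i1 } @flat_cmpxchg_maybe_private(ptr %p, i64 %cmp, i64 %new) {
entry:
  %is.priv = call i1 @llvm.amdgcn.is.private(ptr %p)
  br i1 %is.priv, label %private, label %shared

private:                      ; non-atomic path, safe for scratch
  %p5 = addrspacecast ptr %p to ptr addrspace(5)
  %orig = load i64, ptr addrspace(5) %p5
  %eq = icmp eq i64 %orig, %cmp
  %sel = select i1 %eq, i64 %new, i64 %orig
  store i64 %sel, ptr addrspace(5) %p5
  br label %done

shared:                       ; the real atomic, now known not to touch private
  %cx = cmpxchg ptr %p, i64 %cmp, i64 %new seq_cst seq_cst, !noalias.addrspace !0
  %cx.old = extractvalue { i64, i1 } %cx, 0
  %cx.ok = extractvalue { i64, i1 } %cx, 1
  br label %done

done:
  %old = phi i64 [ %orig, %private ], [ %cx.old, %shared ]
  %ok = phi i1 [ %eq, %private ], [ %cx.ok, %shared ]
  %ret0 = insertvalue { i64, i1 } poison, i64 %old, 0
  %ret = insertvalue { i64, i1 } %ret0, i1 %ok, 1
  ret { i64, i1 } %ret
}

declare i1 @llvm.amdgcn.is.private(ptr)

!0 = !{i32 5, i32 6}
```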
[llvm-branch-commits] [llvm] AMDGPU: Add noalias.addrspace metadata when autoupgrading atomic intrinsics (PR #102599)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/102599 >From a2719d4938a1eaf135c275257b1b6c0318ccc801 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 9 Aug 2024 14:51:41 +0400 Subject: [PATCH] AMDGPU: Add noalias.addrspace metadata when autoupgrading atomic intrinsics This will be needed to continue generating the raw instruction in the flat case. --- llvm/lib/IR/AutoUpgrade.cpp| 13 - llvm/test/Bitcode/amdgcn-atomic.ll | 45 -- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 6f833acd6dbc0d..ca2602e56136a7 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -34,9 +34,11 @@ #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" @@ -4235,13 +4237,22 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, AtomicRMWInst *RMW = Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID); - if (PtrTy->getAddressSpace() != 3) { + unsigned AddrSpace = PtrTy->getAddressSpace(); + if (AddrSpace != AMDGPUAS::LOCAL_ADDRESS) { MDNode *EmptyMD = MDNode::get(F->getContext(), {}); RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD); if (RMWOp == AtomicRMWInst::FAdd && RetTy->isFloatTy()) RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD); } + if (AddrSpace == AMDGPUAS::FLAT_ADDRESS) { +MDBuilder MDB(F->getContext()); +MDNode *RangeNotPrivate = +MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS), +APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1)); +RMW->setMetadata(LLVMContext::MD_noalias_addrspace, RangeNotPrivate); + } + if (IsVolatile) RMW->setVolatile(true); diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll index d642372799f56b..87ca1e3a617ed9 100644 --- a/llvm/test/Bitcode/amdgcn-atomic.ll +++ b/llvm/test/Bitcode/amdgcn-atomic.ll @@ -2,10 +2,10 @@ define void @atomic_inc(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { - ; CHECK: atomicrmw uinc_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ; CHECK: atomicrmw uinc_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !1{{$}} %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false) - ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !1 %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false) ; CHECK: atomicrmw uinc_wrap ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4{{$}} @@ -26,10 +26,10 @@ define void @atomic_inc(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr } define void @atomic_dec(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { - ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !noalias.addrspace !0, 
!amdgpu.no.fine.grained.memory !1{{$}} %result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false) - ; CHECK: atomicrmw udec_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ; CHECK: atomicrmw udec_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !1 %result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false) ; CHECK: atomicrmw udec_wrap ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4{{$}} @@ -51,49 +51,49 @@ define void @atomic_dec(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr ; Test some invalid ordering handling define void @ordering(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { - ; CHECK: atomicrmw volatile uinc_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ; CHECK: atomicrmw volatile uinc_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !1{{$}} %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true) - ; CHECK:
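For reference, this is what the upgraded form looks like with the metadata nodes that the CHECK lines refer to spelled out — a minimal sketch, where the `!{i32 5, i32 6}` range assumes AMDGPU's numbering in which address space 5 is private:

```llvm
define i32 @upgraded_inc(ptr %ptr) {
  ; noalias.addrspace !0 asserts the flat pointer never points into the
  ; private (scratch) address space; !1 is an empty presence-only marker.
  %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !1
  ret i32 %result
}

!0 = !{i32 5, i32 6} ; excluded address-space range [5, 6)
!1 = !{}
```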
[llvm-branch-commits] [llvm] AMDGPU: Add baseline tests for cmpxchg custom expansion (PR #109408)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/109408 >From caecd58b94c52b5568fc0014dad1c51796e4d36e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 12 Sep 2024 12:44:04 +0400 Subject: [PATCH] AMDGPU: Add baseline tests for cmpxchg custom expansion We need a non-atomic path if flat may access private. --- .../AMDGPU/flat_atomics_i64_noprivate.ll | 34 +-- .../AtomicExpand/AMDGPU/expand-atomic-mmra.ll | 12 +- ...and-atomic-rmw-fadd-flat-specialization.ll | 4 +- ...expand-atomicrmw-flat-noalias-addrspace.ll | 149 - .../expand-cmpxchg-flat-maybe-private.ll | 208 ++ 5 files changed, 382 insertions(+), 25 deletions(-) create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-cmpxchg-flat-maybe-private.ll diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index c0b3adce81342d..f4fe003a34d3fb 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -5088,7 +5088,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT:s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5145,7 +5145,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT:s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 9000 - %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5206,7 +5206,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT:s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 ret void @@ -5270,7 +5270,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5344,7 +5344,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 ret void @@ -5398,7 +5398,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-NEXT:global_inv scope:SCOPE_DEV ; GFX12-NEXT:s_endpgm entry: - %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void 
} @@ -5454,7 +5454,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GFX12-NEXT:flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT:s_endpgm entry: - %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 ret void @@ -5513,7 +5513,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT:s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5582,7 +5582,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT:s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst se
[llvm-branch-commits] [mlir] b3cdd66 - Revert "[MLIR][TilingInterface] Extend consumer fusion for multi-use of produ…"
Author: Abhishek Varma Date: 2024-09-30T14:51:23+05:30 New Revision: b3cdd66549a17e8ab83b23117d0a1fc9feb50534 URL: https://github.com/llvm/llvm-project/commit/b3cdd66549a17e8ab83b23117d0a1fc9feb50534 DIFF: https://github.com/llvm/llvm-project/commit/b3cdd66549a17e8ab83b23117d0a1fc9feb50534.diff LOG: Revert "[MLIR][TilingInterface] Extend consumer fusion for multi-use of produ…" This reverts commit b8c974f09391d78035928c599a911009bbe49e85. Added: Modified: mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir Removed: diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 50cfd29e6bf907..7cfd772a72b175 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -1481,29 +1481,21 @@ checkAssumptionForFusingConsumer(tensor::InsertSliceOp candidateSliceOp) { /// failure otherwise. static FailureOr<OpOperand *> getConsumerFromUses(Value val, Block *containingOpBlock) { - // Check that the value has exactly one use which isn't a scf.yield or a - // tensor.parallel_insert_slice op. - OpOperand *operand = nullptr; - for (OpOperand &opOperand : val.getUses()) { -Operation *consumerOp = opOperand.getOwner(); -if (isa<scf::YieldOp, tensor::ParallelInsertSliceOp>(consumerOp)) - continue; -if (operand) - return failure(); -// TODO: We have to init result of consumer before scf.for, use -// DestinationStyleOpInterface to get result shape from init for now. -// Add support for other op such as op has InferTypeOpInterface. -if (!isa<TilingInterface>(consumerOp) || -!isa<DestinationStyleOpInterface>(consumerOp)) - return failure(); -if (containingOpBlock != consumerOp->getBlock()) - return failure(); -operand = &opOperand; - } - - if (operand) -return operand; - return failure(); + // Step 1. Check that the value has exactly one use. + if (!llvm::hasSingleElement(val.getUses())) +return failure(); + // Step 2. Get uses. + OpOperand &operand = (*val.getUses().begin()); + Operation *consumerOp = operand.getOwner(); + // TODO: We have to init result of consumer before scf.for, use + // DestinationStyleOpInterface to get result shape from init for now. + // Add support for other op such as op has InferTypeOpInterface. + if (!isa<TilingInterface>(consumerOp) || + !isa<DestinationStyleOpInterface>(consumerOp)) +return failure(); + if (containingOpBlock != consumerOp->getBlock()) +return failure(); + return &operand; } /// Find the perfectly nested loops outside of given loop(included) sorted from diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index f5f703d95e2d5b..fdefdcc453ae7a 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -437,74 +437,3 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[LOOP_RESULT2]]#0, %[[LOOP_RESULT2]]#1 : // CHECK: } // CHECK: return %[[LOOP_RESULT1]]#1 : - -// - - -// This test case checks fusion of consumer even if the producer has multiple uses. -// The multiple uses of the producer essentially means that besides the consumer -// op in concern, the only other uses of the producer are allowed in :- -// 1. scf.yield -// 2. 
tensor.parallel_insert_slice - -module { - module { -func.func @fuse_consumer_for_multi_use_producer(%arg0: tensor<256x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256x256xf32>) -> (tensor<256x256xf32>, tensor<256x256xf32>) { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c256 = arith.constant 256 : index - %cst = arith.constant 0.00e+00 : f32 - %0 = tensor.empty() : tensor<256x256xf32> - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x256xf32>) -> tensor<256x256xf32> - %2:2 = scf.for %arg3 = %c0 to %c256 step %c64 iter_args(%arg4 = %1, %arg5 = %arg2) -> (tensor<256x256xf32>, tensor<256x256xf32>) { -%3 = scf.for %arg6 = %c0 to %c256 step %c64 iter_args(%arg7 = %arg4) -> (tensor<256x256xf32>) { - %extracted_slice = tensor.extract_slice %arg7[%arg3, %arg6] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> - %extracted_slice_0 = tensor.extract_slice %arg0[%arg3, 0] [64, 512] [1, 1] : tensor<256x512xf32> to tensor<64x512xf32> - %extracted_slice_1 = tensor.extract_slice %arg1[0, %arg6] [512, 64] [1, 1] : tensor<512x256xf32> to tensor<512x64xf32> - %5 = linalg.matmul ins(%extracted_slice_0, %extracted_slice_1 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32> - %ins
[llvm-branch-commits] [llvm] release/19.x: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) (PR #110470)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/110470
[llvm-branch-commits] [llvm] release/19.x: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) (PR #110470)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/110470 Backport e9d12a6b451bd403d95105aa976a011dc821f126 83fe85115da9dc25fa270d2ea8140113c8d49670 Requested by: @arsenm >From 4477e7b862c603da7586598248e4ea0c60c81407 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Mon, 30 Sep 2024 10:39:17 +0200 Subject: [PATCH 1/2] AMDGPU: Add test for 16 bit unsigned scratch offsets (#110255) Large scratch offset with one on highest bit selected as negative, negative offset has same binary representation in 16 bits as large unsigned offset. (cherry picked from commit e9d12a6b451bd403d95105aa976a011dc821f126) --- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 239 ++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 444 ++ 2 files changed, 683 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index a5e4151bf36958..47ca6f416b02b0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1513,4 +1513,243 @@ bb: ret void } +define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT:s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT:s_add_u32 s0, s2, 0xffe8 +; GFX9-NEXT:scratch_load_dword v2, off, s0 +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:global_store_dword v[0:1], v2, off +; GFX9-NEXT:s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT:s_add_u32 s0, s0, s5 +; GFX10-NEXT:s_addc_u32 s1, s1, 0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT:s_add_u32 s0, s2, 0xffe8 +; GFX10-NEXT:scratch_load_dword v2, off, s0 +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:global_store_dword v[0:1], v2, off +; GFX10-NEXT:s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT:s_add_u32 s0, s0, 0xffe8 +; GFX940-NEXT:scratch_load_dword v2, off, s0 +; GFX940-NEXT:s_waitcnt vmcnt(0) +; GFX940-NEXT:global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT:s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT:s_add_u32 s0, s0, 0xffe8 +; GFX11-NEXT:scratch_load_b32 v2, off, s0 +; GFX11-NEXT:s_waitcnt vmcnt(0) +; GFX11-NEXT:global_store_b32 v[0:1], v2, off +; GFX11-NEXT:s_nop 0 +; GFX11-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT:s_endpgm +; +; GFX12-LABEL: sgpr_base_large_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:65512 +; GFX12-NEXT:s_wait_loadcnt 0x0 +; GFX12-NEXT:global_store_b32 v[0:1], v2, off +; GFX12-NEXT:s_nop 0 +; GFX12-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT:s_endpgm +entry: + %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512 + %load = load i32, ptr addrspace(5) %large_offset, align 4 + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset_split: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT:s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT:s_and_b32 s0, s2, -4 +; GFX9-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX9-NEXT:scratch_load_dword v2, off, s0 glc +; GFX9-NEXT:s_waitcnt vmcnt(0) +; 
GFX9-NEXT:global_store_dword v[0:1], v2, off +; GFX9-NEXT:s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset_split: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT:s_add_u32 s0, s0, s5 +; GFX10-NEXT:s_addc_u32 s1, s1, 0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT:s_and_b32 s0, s2, -4 +; GFX10-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX10-NEXT:scratch_load_dword v2, off, s0 glc dlc +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:global_store_dword v[0:1], v2, off +; GFX10-NEXT:s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset_split: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT:s_and_b32 s0, s0, -4 +; GFX940-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX940-NEXT:scratch_load_dword v2, off, s0 sc0 sc1 +; GFX940-NEXT:s_waitcnt vmcnt(0) +; GFX940-NEXT:global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT:s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset_split: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT:s_and_b32 s0, s0, -4 +; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX11-NEXT:
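For context on why these tests matter: the offset they use, 65512, is 0xFFE8, and reinterpreting that same bit pattern as a signed 16-bit immediate gives 0xFFE8 - 0x10000 = -24. With the offset materialized as `MVT::i16`, a large unsigned scratch offset is therefore indistinguishable from a small negative one, which is what the second patch in this backport fixes by widening the target constant to `MVT::i32`.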
[llvm-branch-commits] [llvm] release/19.x: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) (PR #110470)
llvmbot wrote: @llvm/pr-subscribers-llvm-globalisel Author: None (llvmbot) Changes Backport e9d12a6b451bd403d95105aa976a011dc821f126 83fe85115da9dc25fa270d2ea8140113c8d49670 Requested by: @arsenm --- Patch is 29.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110470.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+3-3) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+239) - (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+444) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b7471bab128509..7b786ee2641721 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1911,7 +1911,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32); return true; } @@ -1967,7 +1967,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; -Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); +Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); return true; } } @@ -2000,7 +2000,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; SAddr = SelectSAddrFI(CurDAG, SAddr); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index a5e4151bf36958..47ca6f416b02b0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1513,4 +1513,243 @@ bb: ret void } +define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT:s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT:s_add_u32 s0, s2, 0xffe8 +; GFX9-NEXT:scratch_load_dword v2, off, s0 +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:global_store_dword v[0:1], v2, off +; GFX9-NEXT:s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT:s_add_u32 s0, s0, s5 +; GFX10-NEXT:s_addc_u32 s1, s1, 0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT:s_add_u32 s0, s2, 0xffe8 +; GFX10-NEXT:scratch_load_dword v2, off, s0 +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:global_store_dword v[0:1], v2, off +; GFX10-NEXT:s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT:s_add_u32 s0, s0, 0xffe8 +; GFX940-NEXT:scratch_load_dword v2, off, s0 +; GFX940-NEXT:s_waitcnt vmcnt(0) +; GFX940-NEXT:global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT:s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT:s_add_u32 s0, s0, 0xffe8 +; GFX11-NEXT:scratch_load_b32 v2, off, s0 +; GFX11-NEXT:s_waitcnt vmcnt(0) +; GFX11-NEXT:global_store_b32 v[0:1], v2, off +; GFX11-NEXT:s_nop 0 +; GFX11-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX11-NEXT:s_endpgm +; +; GFX12-LABEL: sgpr_base_large_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:65512 +; GFX12-NEXT:s_wait_loadcnt 0x0 +; GFX12-NEXT:global_store_b32 v[0:1], v2, off +; GFX12-NEXT:s_nop 0 +; GFX12-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT:s_endpgm +entry: + %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512 + %load = load i32, ptr addrspace(5) %large_offset, align 4 + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset_split: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT:s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT:s_and_b32 s0, s2, -4 +; GFX9-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX9-NEXT:scratch_load_dword v2, off, s0 glc +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:global_store_dword v[0:1], v2, off +; GFX9-NEXT:s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset_split: +; GFX10:
[llvm-branch-commits] [llvm] release/19.x: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) (PR #110470)
llvmbot wrote: @jayfoad @arsenm What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/110470
[llvm-branch-commits] [llvm] release/19.x: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) (PR #110470)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/110470
[llvm-branch-commits] [llvm] [AMDGPU] Serialize WWM_REG vreg flag (PR #110229)
@@ -684,8 +684,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, void setFlag(Register Reg, uint8_t Flag) { assert(Reg.isVirtual()); -if (VRegFlags.inBounds(Reg)) - VRegFlags[Reg] |= Flag; +VRegFlags.grow(Reg); Akshat-Oke wrote: The MIR function is parsed after parsing the options, so the `noteNewVirtualRegister` callback doesn't take effect. https://github.com/llvm/llvm-project/pull/110229
[llvm-branch-commits] [llvm] release/19.x: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) (PR #110470)
https://github.com/jayfoad approved this pull request. https://github.com/llvm/llvm-project/pull/110470
[llvm-branch-commits] [llvm] AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (PR #110256)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/110256
[llvm-branch-commits] [flang] 00e4f81 - Revert "[flang] Implement GETUID and GETGID intrinsics (#108017)"
Author: David Truby Date: 2024-09-30T17:26:07+01:00 New Revision: 00e4f81a294e8e2d78d572c76dd017a8af050cf0 URL: https://github.com/llvm/llvm-project/commit/00e4f81a294e8e2d78d572c76dd017a8af050cf0 DIFF: https://github.com/llvm/llvm-project/commit/00e4f81a294e8e2d78d572c76dd017a8af050cf0.diff LOG: Revert "[flang] Implement GETUID and GETGID intrinsics (#108017)" This reverts commit 054eadcb117ba7c86a99dff5c9d0ed101c7f17ea. Added: Modified: flang/docs/Intrinsics.md flang/include/flang/Evaluate/target.h flang/include/flang/Optimizer/Builder/IntrinsicCall.h flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h flang/include/flang/Runtime/extensions.h flang/include/flang/Tools/TargetSetup.h flang/lib/Evaluate/intrinsics.cpp flang/lib/Optimizer/Builder/IntrinsicCall.cpp flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp flang/lib/Semantics/check-call.cpp flang/lib/Semantics/check-call.h flang/lib/Semantics/expression.cpp flang/runtime/extensions.cpp flang/unittests/Optimizer/Builder/Runtime/CommandTest.cpp flang/unittests/Optimizer/CMakeLists.txt Removed: flang/test/Semantics/windows.f90 flang/unittests/Optimizer/Builder/Runtime/IntrinsicsTest.cpp diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index e288fdeec6cd22..87716731ead855 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -765,7 +765,7 @@ This phase currently supports all the intrinsic procedures listed above but the | Coarray intrinsic functions | COSHAPE | | Object characteristic inquiry functions | ALLOCATED, ASSOCIATED, EXTENDS_TYPE_OF, IS_CONTIGUOUS, PRESENT, RANK, SAME_TYPE, STORAGE_SIZE | | Type inquiry intrinsic functions | BIT_SIZE, DIGITS, EPSILON, HUGE, KIND, MAXEXPONENT, MINEXPONENT, NEW_LINE, PRECISION, RADIX, RANGE, TINY| -| Non-standard intrinsic functions | AND, OR, XOR, SHIFT, ZEXT, IZEXT, COSD, SIND, TAND, ACOSD, ASIND, ATAND, ATAN2D, COMPL, EQV, NEQV, INT8, JINT, JNINT, KNINT, QCMPLX, DREAL, DFLOAT, QEXT, QFLOAT, QREAL, DNUM, NUM, JNUM, KNUM, QNUM, RNUM, RAN, RANF, ILEN, SIZEOF, MCLOCK, SECNDS, COTAN, IBCHNG, ISHA, ISHC, ISHL, IXOR, IARG, IARGC, NARGS, GETPID, NUMARG, BADDRESS, IADDR, CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, MALLOC, GETUID, GETGID | +| Non-standard intrinsic functions | AND, OR, XOR, SHIFT, ZEXT, IZEXT, COSD, SIND, TAND, ACOSD, ASIND, ATAND, ATAN2D, COMPL, EQV, NEQV, INT8, JINT, JNINT, KNINT, QCMPLX, DREAL, DFLOAT, QEXT, QFLOAT, QREAL, DNUM, NUM, JNUM, KNUM, QNUM, RNUM, RAN, RANF, ILEN, SIZEOF, MCLOCK, SECNDS, COTAN, IBCHNG, ISHA, ISHC, ISHL, IXOR, IARG, IARGC, NARGS, GETPID, NUMARG, BADDRESS, IADDR, CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, MALLOC | | Intrinsic subroutines |MVBITS (elemental), CPU_TIME, DATE_AND_TIME, EVENT_QUERY, EXECUTE_COMMAND_LINE, GET_COMMAND, GET_COMMAND_ARGUMENT, GET_ENVIRONMENT_VARIABLE, MOVE_ALLOC, RANDOM_INIT, RANDOM_NUMBER, RANDOM_SEED, SIGNAL, SLEEP, SYSTEM, SYSTEM_CLOCK | | Atomic intrinsic subroutines | ATOMIC_ADD | | Collective intrinsic subroutines | CO_REDUCE | diff --git a/flang/include/flang/Evaluate/target.h b/flang/include/flang/Evaluate/target.h index b347c549e012da..d076fcbf083078 100644 --- a/flang/include/flang/Evaluate/target.h +++ b/flang/include/flang/Evaluate/target.h @@ -102,11 +102,6 @@ class TargetCharacteristics { bool isPPC() const { return isPPC_; } void set_isPPC(bool isPPC = false); - bool isOSWindows() const { return isOSWindows_; } - void set_isOSWindows(bool isOSWindows = false) { -isOSWindows_ = isOSWindows; - }; - IeeeFeatures &ieeeFeatures() { return ieeeFeatures_; } const IeeeFeatures 
&ieeeFeatures() const { return ieeeFeatures_; } @@ -116,7 +111,6 @@ class TargetCharacteristics { std::uint8_t align_[common::TypeCategory_enumSize][maxKind]{}; bool isBigEndian_{false}; bool isPPC_{false}; - bool isOSWindows_{false}; bool areSubnormalsFlushedToZero_{false}; Rounding roundingMode_{defaultRounding}; std::size_t procedurePointerByteSize_{8}; diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index b2da6138fc9d8e..78bb82b17d4050 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -256,10 +256,6 @@ struct IntrinsicLibrary { llvm::ArrayRef args); void genGetCommandArgument(mlir::ArrayRef args); void genGetEnvironmentVariable(llvm::ArrayRef); - mlir::Value genGetGID(mlir::Type resultType, -llvm::ArrayRef args); - mlir::Value genGetUID(mlir::Type resultType, -llvm::ArrayRef args); fir::ExtendedValue genIall(mlir::Type, llvm::ArrayRef); mlir::Value genIand(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genIany(mlir::Type, llvm::ArrayRef); diff
[llvm-branch-commits] [flang] [mlir] [MLIR][OpenMP] Use map format to represent use_device_{addr, ptr} (PR #109810)
https://github.com/agozillon approved this pull request. LGTM, @TIFitis would be a good secondary reviewer if he wishes to do so! https://github.com/llvm/llvm-project/pull/109810
[llvm-branch-commits] [flang] [mlir] [MLIR][OpenMP] Use map format to represent use_device_{addr, ptr} (PR #109810)
https://github.com/agozillon edited https://github.com/llvm/llvm-project/pull/109810
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Improve omp.section block arguments handling (PR #110266)
https://github.com/mjklemm approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/110266
[llvm-branch-commits] [flang] [mlir] [MLIR][OpenMP] Use map format to represent use_device_{addr, ptr} (PR #109810)
https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/109810 >From f61e3a60d6f494d08b58ded9b802f2b3d92b728f Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 20 Sep 2024 17:11:34 +0100 Subject: [PATCH] [MLIR][OpenMP] Use map format to represent use_device_{addr,ptr} This patch updates the `omp.target_data` operation to use the same formatting as `map` clauses on `omp.target` for `use_device_addr` and `use_device_ptr`. This is done so the mapping that is being enforced between op arguments and associated entry block arguments is explicit. The way it is achieved is by marking these clauses as entry block argument-defining and adjusting printer/parsers accordingly. As a result of this change, block arguments for `use_device_addr` come before those for `use_device_ptr`, which is the opposite of the previous undocumented situation. Some unit tests are updated based on this change, in addition to those updated because of the format change. --- .../Fir/convert-to-llvm-openmp-and-fir.fir| 5 +- flang/test/Lower/OpenMP/target.f90| 6 +- .../use-device-ptr-to-use-device-addr.f90 | 12 +-- .../mlir/Dialect/OpenMP/OpenMPClauses.td | 28 ++- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 6 ++ .../Dialect/OpenMP/OpenMPOpsInterfaces.td | 37 - mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 43 +++ .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 77 --- mlir/test/Dialect/OpenMP/ops.mlir | 6 +- mlir/test/Target/LLVMIR/omptarget-llvm.mlir | 19 ++--- .../openmp-target-use-device-nested.mlir | 3 +- 11 files changed, 179 insertions(+), 63 deletions(-) diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index 4d226eaa754c12..61f18008633d50 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -429,13 +429,14 @@ func.func @_QPopenmp_target_data_region() { func.func @_QPomp_target_data_empty() { %0 = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_data_emptyEa"} - omp.target_data use_device_addr(%0 : !fir.ref>) { + omp.target_data use_device_addr(%0 -> %arg0 : !fir.ref>) { +omp.terminator } return } // CHECK-LABEL: llvm.func @_QPomp_target_data_empty -// CHECK: omp.target_data use_device_addr(%1 : !llvm.ptr) { +// CHECK: omp.target_data use_device_addr(%1 -> %{{.*}} : !llvm.ptr) { // CHECK: } // - diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index dedce581436490..ab33b6b3808315 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -506,9 +506,8 @@ subroutine omp_target_device_ptr type(c_ptr) :: a integer, target :: b !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> {{.*}} {name = "a"} - !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}) use_device_ptr({{.*}}) + !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}) use_device_ptr({{.*}} -> %[[VAL_1:.*]] : !fir.ref>) !$omp target data map(tofrom: a) use_device_ptr(a) - !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref>): !CHECK: {{.*}} = fir.coordinate_of %[[VAL_1:.*]], {{.*}} : (!fir.ref>, !fir.field) -> !fir.ref a = c_loc(b) !CHECK: omp.terminator @@ -529,9 +528,8 @@ subroutine omp_target_device_addr !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, !fir.box>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBERS]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> {name = "a"} !CHECK: %[[DEV_ADDR_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, i32) var_ptr_ptr({{.*}} : 
!fir.llvm_ptr>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr> {name = ""} !CHECK: %[[DEV_ADDR:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, !fir.box>) map_clauses(tofrom) capture(ByRef) members(%[[DEV_ADDR_MEMBERS]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> {name = "a"} - !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[DEV_ADDR_MEMBERS]], %[[DEV_ADDR]] : {{.*}}) { + !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[DEV_ADDR_MEMBERS]] -> %[[ARG_0:.*]], %[[DEV_ADDR]] -> %[[ARG_1:.*]] : !fir.llvm_ptr>, !fir.ref>>) { !$omp target data map(tofrom: a) use_device_addr(a) - !CHECK: ^bb0(%[[ARG_0:.*]]: !fir.llvm_ptr>, %[[ARG_1:.*]]: !fir.ref>>): !CHECK: %[[VAL_1_DECL:.*]]:2 = hlfir.declare %[[ARG_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !CHECK: %[[C10:.*]] = arith.constant 10 : i32 !CHECK: %[[A_BOX:.*]] = fir.load %[[VAL_1_DECL]]#0 : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 index 085f5419fa7f88..cb26246a6
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Improve omp.section block arguments handling (PR #110266)
https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/110266 >From d6920f4bd10cdf88d6d640f8e1da2c595c39bdb6 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 26 Sep 2024 11:42:03 +0100 Subject: [PATCH] [MLIR][OpenMP] Improve omp.section block arguments handling The `omp.section` operation is an outlier in that the block arguments it has are defined by clauses on the required parent `omp.sections` operation. This patch updates the definition of this operation introducing the `BlockArgOpenMPOpInterface` to simplify the handling and verification of these block arguments, implemented based on the parent `omp.sections`. --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 12 +++-- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 12 + mlir/test/Dialect/OpenMP/invalid.mlir | 25 +++ mlir/test/Dialect/OpenMP/ops.mlir | 6 + 4 files changed, 53 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index d2a2b44c042fb7..66f63fc02fe2f3 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -207,8 +207,9 @@ def TeamsOp : OpenMP_Op<"teams", traits = [ // 2.8.1 Sections Construct //===--===// -def SectionOp : OpenMP_Op<"section", [HasParent<"SectionsOp">], - singleRegion = true> { +def SectionOp : OpenMP_Op<"section", traits = [ +BlockArgOpenMPOpInterface, HasParent<"SectionsOp"> + ], singleRegion = true> { let summary = "section directive"; let description = [{ A section operation encloses a region which represents one section in a @@ -218,6 +219,13 @@ def SectionOp : OpenMP_Op<"section", [HasParent<"SectionsOp">], operation. This is done to reflect situations where these block arguments represent variables private to each section. }]; + let extraClassDeclaration = [{ +// Override BlockArgOpenMPOpInterface methods based on the parent +// omp.sections operation. Only forward-declare here because SectionsOp is +// not completely defined at this point. 
+unsigned numPrivateBlockArgs(); +unsigned numReductionBlockArgs(); + }] # clausesExtraClassDeclaration; let assemblyFormat = "$region attr-dict"; } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 220eb848ab4de2..928a07580b2637 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1844,6 +1844,18 @@ LogicalResult TeamsOp::verify() { getReductionByref()); } +//===--===// +// SectionOp +//===--===// + +unsigned SectionOp::numPrivateBlockArgs() { + return getParentOp().numPrivateBlockArgs(); +} + +unsigned SectionOp::numReductionBlockArgs() { + return getParentOp().numReductionBlockArgs(); +} + //===--===// // SectionsOp //===--===// diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 2e4df7422e4a49..a228b6430560ea 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -1572,6 +1572,31 @@ func.func @omp_sections() { // - +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +func.func @omp_sections(%x : !llvm.ptr) { + omp.sections reduction(@add_f32 %x -> %arg0 : !llvm.ptr) { +// expected-error @below {{op expected at least 1 entry block argument(s)}} +omp.section { + omp.terminator +} +omp.terminator + } + return +} + +// - + func.func @omp_single(%data_var : memref) -> () { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.single" (%data_var) ({ diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index ce3351ba1149f3..a4423782a723bf 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -1127,11 +1127,13 @@ func.func @sections_reduction() { omp.sections reduction(@add_f32 %0 -> %arg0 : !llvm.ptr) { // CHECK: omp.section omp.section { +^bb0(%arg1 : !llvm.ptr): %1 = arith.constant 2.0 : f32 omp.terminator } // CHECK: omp.section omp.section { +^bb0(%arg1 : !llvm.ptr): %1 = arith.constant 3.0 : f32 omp.terminator } @@ -1148,11 +1150,13 @@ func.func @sections_reduction_byref() { omp
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Document entry block argument-defining clauses (NFC) (PR #109811)
https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/109811 >From a821f44e2c9ac732c752abae62385c4d78082a2b Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 24 Sep 2024 15:40:17 +0100 Subject: [PATCH] [MLIR][OpenMP] Document entry block argument-defining clauses (NFC) This patch adds general information on the proposed approach to unify the handling and representation of clauses that define entry block arguments attached to operations that accept them. --- mlir/docs/Dialects/OpenMPDialect/_index.md | 70 +- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/mlir/docs/Dialects/OpenMPDialect/_index.md b/mlir/docs/Dialects/OpenMPDialect/_index.md index 88437b8cf828cc..3c30b29d09356b 100644 --- a/mlir/docs/Dialects/OpenMPDialect/_index.md +++ b/mlir/docs/Dialects/OpenMPDialect/_index.md @@ -285,7 +285,75 @@ argument's type: specific `mlir::Attribute` subclass) will be used instead. - Other attribute types will be represented with their `storageType`. - It will create `Operands` structure for each operation, which is an -empty structure subclassing all operand structures defined for the corresponding `OpenMP_Op`'s clauses. +empty structure subclassing all operand structures defined for the corresponding +`OpenMP_Op`'s clauses. + +### Entry Block Argument-Defining Clauses + +Certain OpenMP clauses introduce in their MLIR representation mappings between +outside values and entry block arguments for the region of the MLIR operation +they are applied to. This enables, for example, the introduction of private +copies of the same underlying variable. Currently, clauses with this property +can be classified in three main categories: + - Map-like clauses: `map`, `use_device_addr` and `use_device_ptr`. + - Reduction-like clauses: `in_reduction`, `reduction` and `task_reduction`. + - Privatization clause: `private`. + +All three kinds of entry block argument-defining clauses use a similar custom +assembly format representation, only differing based on the different pieces of +information attached to each kind. Below, one example of each is shown: + +```mlir +omp.target map_entries(%x -> %x.m, %y -> %y.m : !llvm.ptr, !llvm.ptr) { + // Use %x.m, %y.m in place of %x and %y... +} + +omp.wsloop reduction(@add.i32 %x -> %x.r, byref @add.f32 %y -> %y.r : !llvm.ptr, !llvm.ptr) { + // Use %x.r, %y.r in place of %x and %y... +} + +omp.parallel private(@x.privatizer %x -> %x.p, @y.privatizer %y -> %y.p : !llvm.ptr, !llvm.ptr) { + // Use %x.p, %y.p in place of %x and %y... +} +``` + +As a consequence of parsing and printing the operation's first region entry +block argument names together with the custom assembly format of these clauses, +entry block arguments (i.e. the `^bb0(...):` line) must not be explicitly +defined for these operations. Additionally, it is not possible to implement this +feature while allowing each clause to be independently parsed and printed, +because they need to be printed/parsed together with the corresponding +operation's first region. They must have a well-defined ordering in which +multiple of these clauses are specified for a given operation, as well. + +The parsing/printing of these clauses together with the region provides the +ability to define entry block arguments directly after the `->`. Forcing a +specific ordering between these clauses makes the block argument ordering +well-defined, which is the property used to easily match each clause with the +entry block arguments defined by it. 
+ +Custom printers and parsers for operation regions based on the entry block +argument-defining clauses they take are implemented based on the +`{parse,print}BlockArgRegion` functions, which take care of the sorting and +formatting of each kind of clause, minimizing code duplication resulting from +this approach. One example of the custom assembly format of an operation taking +the `private` and `reduction` clauses is the following: + +```tablegen +let assemblyFormat = clausesAssemblyFormat # [{ + custom($region, $private_vars, type($private_vars), + $private_syms, $reduction_vars, type($reduction_vars), $reduction_byref, + $reduction_syms) attr-dict +}]; +``` + +The `BlockArgOpenMPOpInterface` has been introduced to simplify the addition and +handling of these kinds of clauses. It holds `num<ClauseName>BlockArgs()` +functions that by default return 0, to be overridden by each clause through the +`extraClassDeclaration` property. Based on these functions and the expected +alphabetical sorting between entry block argument-defining clauses, it +implements `get<ClauseName>BlockArgs()` functions that are the intended method +of accessing clause-defined block arguments. ## Loop-Associated Directives
[llvm-branch-commits] [llvm] release/19.x: FastISel: Fix incorrectly using getPointerTy (#110465) (PR #110490)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/110490 Backport 81ba95cefe1b5a12f0a7d8e6a383bcce9e95b785 Requested by: @arsenm >From 2c0b211043d4516fa33c1a87c0e239f4de58b4fc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 30 Sep 2024 13:43:53 +0400 Subject: [PATCH] FastISel: Fix incorrectly using getPointerTy (#110465) This was using the default address space instead of the correct one. Fixes #56055 (cherry picked from commit 81ba95cefe1b5a12f0a7d8e6a383bcce9e95b785) --- llvm/include/llvm/CodeGen/FastISel.h | 2 +- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 8 +-- llvm/lib/Target/X86/X86FastISel.cpp| 4 +- llvm/test/CodeGen/X86/issue56055.ll| 81 ++ 4 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/X86/issue56055.ll diff --git a/llvm/include/llvm/CodeGen/FastISel.h b/llvm/include/llvm/CodeGen/FastISel.h index 3cbc35400181dd..f3c4cc8d0511d4 100644 --- a/llvm/include/llvm/CodeGen/FastISel.h +++ b/llvm/include/llvm/CodeGen/FastISel.h @@ -275,7 +275,7 @@ class FastISel { /// This is a wrapper around getRegForValue that also takes care of /// truncating or sign-extending the given getelementptr index value. - Register getRegForGEPIndex(const Value *Idx); + Register getRegForGEPIndex(MVT PtrVT, const Value *Idx); /// We're checking to see if we can fold \p LI into \p FoldInst. Note /// that we could have a sequence where multiple LLVM IR instructions are diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index ef9f7833551905..246acc7f405837 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -380,14 +380,13 @@ void FastISel::updateValueMap(const Value *I, Register Reg, unsigned NumRegs) { } } -Register FastISel::getRegForGEPIndex(const Value *Idx) { +Register FastISel::getRegForGEPIndex(MVT PtrVT, const Value *Idx) { Register IdxN = getRegForValue(Idx); if (!IdxN) // Unhandled operand. Halt "fast" selection and bail. return Register(); // If the index is smaller or larger than intptr_t, truncate or extend it. - MVT PtrVT = TLI.getPointerTy(DL); EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); if (IdxVT.bitsLT(PtrVT)) { IdxN = fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND, IdxN); @@ -543,7 +542,8 @@ bool FastISel::selectGetElementPtr(const User *I) { uint64_t TotalOffs = 0; // FIXME: What's a good SWAG number for MaxOffs? uint64_t MaxOffs = 2048; - MVT VT = TLI.getPointerTy(DL); + MVT VT = TLI.getValueType(DL, I->getType()).getSimpleVT(); + for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I); GTI != E; ++GTI) { const Value *Idx = GTI.getOperand(); @@ -584,7 +584,7 @@ bool FastISel::selectGetElementPtr(const User *I) { // N = N + Idx * ElementSize; uint64_t ElementSize = GTI.getSequentialElementStride(DL); - Register IdxN = getRegForGEPIndex(Idx); + Register IdxN = getRegForGEPIndex(VT, Idx); if (!IdxN) // Unhandled operand. Halt "fast" selection and bail. 
return false; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 2eae155956368f..5d594bd54fbfc4 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -902,6 +902,8 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { uint64_t Disp = (int32_t)AM.Disp; unsigned IndexReg = AM.IndexReg; unsigned Scale = AM.Scale; +MVT PtrVT = TLI.getValueType(DL, U->getType()).getSimpleVT(); + gep_type_iterator GTI = gep_type_begin(U); // Iterate through the indices, folding what we can. Constants can be // folded, and one dynamic index can be handled, if the scale is supported. @@ -937,7 +939,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { (S == 1 || S == 2 || S == 4 || S == 8)) { // Scaled-index addressing. Scale = S; - IndexReg = getRegForGEPIndex(Op); + IndexReg = getRegForGEPIndex(PtrVT, Op); if (IndexReg == 0) return false; break; diff --git a/llvm/test/CodeGen/X86/issue56055.ll b/llvm/test/CodeGen/X86/issue56055.ll new file mode 100644 index 00..27eaf13e3b00be --- /dev/null +++ b/llvm/test/CodeGen/X86/issue56055.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -fast-isel < %s | FileCheck -check-prefixes=CHECK,FASTISEL %s +; RUN: llc < %s | FileCheck -check-prefixes=CHECK,SDAG %s + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-windows-msvc" + +define void @issue56055(ptr addrspace(270) %ptr, ptr %out) { +; CHECK-LABEL: issue56055: +; CHECK
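To see why `getPointerTy(DL)` is wrong here, a distilled version of the test above (function and value names are illustrative): on this target the default address space uses 64-bit pointers while address space 270 uses 32-bit ones, so GEP index arithmetic must use the pointer width of the operand's actual address space:

```llvm
; Subset of the datalayout from the test: addrspace(270) pointers are 32-bit.
target datalayout = "e-m:w-p270:32:32-i64:64-n8:16:32:64-S128"

define void @addrspace270_gep(ptr addrspace(270) %ptr, ptr %out) {
  ; getPointerTy(DL) would say i64 (address space 0); the correct index
  ; type for this GEP is i32, the width of addrspace(270) pointers.
  %add.ptr = getelementptr inbounds i8, ptr addrspace(270) %ptr, i32 2
  store ptr addrspace(270) %add.ptr, ptr %out
  ret void
}
```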
[llvm-branch-commits] [llvm] release/19.x: FastISel: Fix incorrectly using getPointerTy (#110465) (PR #110490)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/110490
[llvm-branch-commits] [llvm] release/19.x: FastISel: Fix incorrectly using getPointerTy (#110465) (PR #110490)
llvmbot wrote: @llvm/pr-subscribers-backend-x86 Author: None (llvmbot) Changes Backport 81ba95cefe1b5a12f0a7d8e6a383bcce9e95b785 Requested by: @arsenm --- Full diff: https://github.com/llvm/llvm-project/pull/110490.diff 4 Files Affected: - (modified) llvm/include/llvm/CodeGen/FastISel.h (+1-1) - (modified) llvm/lib/CodeGen/SelectionDAG/FastISel.cpp (+4-4) - (modified) llvm/lib/Target/X86/X86FastISel.cpp (+3-1) - (added) llvm/test/CodeGen/X86/issue56055.ll (+81) ``diff diff --git a/llvm/include/llvm/CodeGen/FastISel.h b/llvm/include/llvm/CodeGen/FastISel.h index 3cbc35400181dd..f3c4cc8d0511d4 100644 --- a/llvm/include/llvm/CodeGen/FastISel.h +++ b/llvm/include/llvm/CodeGen/FastISel.h @@ -275,7 +275,7 @@ class FastISel { /// This is a wrapper around getRegForValue that also takes care of /// truncating or sign-extending the given getelementptr index value. - Register getRegForGEPIndex(const Value *Idx); + Register getRegForGEPIndex(MVT PtrVT, const Value *Idx); /// We're checking to see if we can fold \p LI into \p FoldInst. Note /// that we could have a sequence where multiple LLVM IR instructions are diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index ef9f7833551905..246acc7f405837 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -380,14 +380,13 @@ void FastISel::updateValueMap(const Value *I, Register Reg, unsigned NumRegs) { } } -Register FastISel::getRegForGEPIndex(const Value *Idx) { +Register FastISel::getRegForGEPIndex(MVT PtrVT, const Value *Idx) { Register IdxN = getRegForValue(Idx); if (!IdxN) // Unhandled operand. Halt "fast" selection and bail. return Register(); // If the index is smaller or larger than intptr_t, truncate or extend it. - MVT PtrVT = TLI.getPointerTy(DL); EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); if (IdxVT.bitsLT(PtrVT)) { IdxN = fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND, IdxN); @@ -543,7 +542,8 @@ bool FastISel::selectGetElementPtr(const User *I) { uint64_t TotalOffs = 0; // FIXME: What's a good SWAG number for MaxOffs? uint64_t MaxOffs = 2048; - MVT VT = TLI.getPointerTy(DL); + MVT VT = TLI.getValueType(DL, I->getType()).getSimpleVT(); + for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I); GTI != E; ++GTI) { const Value *Idx = GTI.getOperand(); @@ -584,7 +584,7 @@ bool FastISel::selectGetElementPtr(const User *I) { // N = N + Idx * ElementSize; uint64_t ElementSize = GTI.getSequentialElementStride(DL); - Register IdxN = getRegForGEPIndex(Idx); + Register IdxN = getRegForGEPIndex(VT, Idx); if (!IdxN) // Unhandled operand. Halt "fast" selection and bail. return false; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 2eae155956368f..5d594bd54fbfc4 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -902,6 +902,8 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { uint64_t Disp = (int32_t)AM.Disp; unsigned IndexReg = AM.IndexReg; unsigned Scale = AM.Scale; +MVT PtrVT = TLI.getValueType(DL, U->getType()).getSimpleVT(); + gep_type_iterator GTI = gep_type_begin(U); // Iterate through the indices, folding what we can. Constants can be // folded, and one dynamic index can be handled, if the scale is supported. @@ -937,7 +939,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { (S == 1 || S == 2 || S == 4 || S == 8)) { // Scaled-index addressing. 
Scale = S; - IndexReg = getRegForGEPIndex(Op); + IndexReg = getRegForGEPIndex(PtrVT, Op); if (IndexReg == 0) return false; break; diff --git a/llvm/test/CodeGen/X86/issue56055.ll b/llvm/test/CodeGen/X86/issue56055.ll new file mode 100644 index 00..27eaf13e3b00be --- /dev/null +++ b/llvm/test/CodeGen/X86/issue56055.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -fast-isel < %s | FileCheck -check-prefixes=CHECK,FASTISEL %s +; RUN: llc < %s | FileCheck -check-prefixes=CHECK,SDAG %s + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-windows-msvc" + +define void @issue56055(ptr addrspace(270) %ptr, ptr %out) { +; CHECK-LABEL: issue56055: +; CHECK: # %bb.0: +; CHECK-NEXT:addl $2, %ecx +; CHECK-NEXT:movl %ecx, (%rdx) +; CHECK-NEXT:retq + %add.ptr = getelementptr inbounds i8, ptr addrspace(270) %ptr, i32 2 + store ptr addrspace(270) %add.ptr, ptr %out + ret void +} + +define void @issue56055_vector(<2 x ptr addrspace(270)> %ptr, ptr %out) { +; CHECK-LABEL: issue56055_vector: +; CH
[llvm-branch-commits] [llvm] release/19.x: FastISel: Fix incorrectly using getPointerTy (#110465) (PR #110490)
llvmbot wrote: @llvm/pr-subscribers-llvm-selectiondag Author: None (llvmbot) Changes Backport 81ba95cefe1b5a12f0a7d8e6a383bcce9e95b785 Requested by: @arsenm --- Full diff: https://github.com/llvm/llvm-project/pull/110490.diff 4 Files Affected: - (modified) llvm/include/llvm/CodeGen/FastISel.h (+1-1) - (modified) llvm/lib/CodeGen/SelectionDAG/FastISel.cpp (+4-4) - (modified) llvm/lib/Target/X86/X86FastISel.cpp (+3-1) - (added) llvm/test/CodeGen/X86/issue56055.ll (+81)
[llvm-branch-commits] [llvm] release/19.x: FastISel: Fix incorrectly using getPointerTy (#110465) (PR #110490)
llvmbot wrote: @nikic What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/110490
[llvm-branch-commits] [llvm] [AMDGPU] Adopt new lowering sequence for `fdiv16` (PR #109295)
@@ -10616,19 +10616,43 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { return FastLowered; SDLoc SL(Op); - SDValue Src0 = Op.getOperand(0); - SDValue Src1 = Op.getOperand(1); - - SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); - SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); - - SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1); - SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1); - - SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32); - SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); - return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0); + // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 + // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 + // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d + // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp + // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n + // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp + // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n + // tmp.u = opx(V_MUL_F32, e32.u, r32.u); + // tmp.u = opx(V_AND_B32, tmp.u, 0xff80) + // q32.u = opx(V_ADD_F32, tmp.u, q32.u); + // q16.u = opx(V_CVT_F16_F32, q32.u); + // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n) + + // We will use ISD::FMA on targets that don't support ISD::FMAD. + unsigned FMADOpCode = + isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA; + + SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); + SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); arsenm wrote: Didn't propagate flags https://github.com/llvm/llvm-project/pull/109295 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
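For reference, a minimal sketch of the fix the comment asks for, with the source node's fast-math flags threaded through the whole refinement sequence (value names mirror the quoted hunk; `Rcp` and the exact set of nodes are assumptions, not the PR's final code):

```cpp
// Take the flags from the original fdiv and pass them to every node built
// for the refinement sequence, instead of dropping them after the extends.
SDNodeFlags Flags = Op->getFlags();
SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt, Flags);
SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Flags);
SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt, Flags);
Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Flags);
```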
[llvm-branch-commits] [llvm] [AMDGPU] Adopt new lowering sequence for `fdiv16` (PR #109295)
@@ -4903,16 +4903,40 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, LLT S16 = LLT::scalar(16); LLT S32 = LLT::scalar(32); + // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 + // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 + // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d + // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp + // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n + // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp + // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n + // tmp.u = opx(V_MUL_F32, e32.u, r32.u); + // tmp.u = opx(V_AND_B32, tmp.u, 0xff80) + // q32.u = opx(V_ADD_F32, tmp.u, q32.u); + // q16.u = opx(V_CVT_F16_F32, q32.u); + // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n) + auto LHSExt = B.buildFPExt(S32, LHS, Flags); auto RHSExt = B.buildFPExt(S32, RHS, Flags); - - auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) + auto NegRHSExt = B.buildFNeg(S32, RHSExt); + auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) .addUse(RHSExt.getReg(0)) .setMIFlags(Flags); - - auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); - auto RDst = B.buildFPTrunc(S16, QUOT, Flags); - + auto Quot = B.buildFMul(S32, LHSExt, Rcp); arsenm wrote: Lost flags after this point https://github.com/llvm/llvm-project/pull/109295 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
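The same fix sketched for this GlobalISel path: `MachineIRBuilder`'s `build*` helpers accept an optional `MIFlags` argument, so the flags can keep flowing past the point flagged above (a sketch under the assumption that the remaining refinement steps use G_FMA):

```cpp
// Keep passing Flags once the refinement sequence starts.
auto NegRHSExt = B.buildFNeg(S32, RHSExt, Flags);
auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
auto Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
auto Quot2 = B.buildFMA(S32, Err, Rcp, Quot, Flags);
```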
[llvm-branch-commits] [flang] [mlir] [MLIR][OpenMP] Use map format to represent use_device_{addr, ptr} (PR #109810)
skatrak wrote: Ping for review! https://github.com/llvm/llvm-project/pull/109810
[llvm-branch-commits] [llvm] [AMDGPU] Add tests for SIPreAllocateWWMRegs (PR #109963)
@@ -0,0 +1,26 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass=si-pre-allocate-wwm-regs -o - -mcpu=tahiti %s | FileCheck %s + +--- + +name: pre_allocate_wwm_regs_strict +tracksRegLiveness: true +body: | arsenm wrote: WWMSpills, SpillVGPRs, SpillPhysVGPRs, SGPRSpillsTo* https://github.com/llvm/llvm-project/pull/109963 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Generalize the instruction size checking in AsmPrinter (PR #110108)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/110108 >From 92eb911fcd781825fa88aaec6c05b9484f49d158 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 25 Sep 2024 16:16:29 +0300 Subject: [PATCH] [AArch64] Generalize the instruction size checking in AsmPrinter Most of PAuth-related code counts the instructions being inserted and asserts that no more bytes are emitted than the size returned by the getInstSizeInBytes(MI) method. This check seems useful not only for PAuth-related instructions. Also, reimplementing it globally in AArch64AsmPrinter makes it more robust and simplifies further refactoring of PAuth-related code. --- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 121 +++--- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 1 + 2 files changed, 44 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 47dd32ad2adc2f..c6ee8d43bd8f2d 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -24,6 +24,7 @@ #include "MCTargetDesc/AArch64TargetStreamer.h" #include "TargetInfo/AArch64TargetInfo.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -86,6 +87,9 @@ class AArch64AsmPrinter : public AsmPrinter { FaultMaps FM; const AArch64Subtarget *STI; bool ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = false; +#ifndef NDEBUG + unsigned InstsEmitted; +#endif public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) @@ -150,8 +154,7 @@ class AArch64AsmPrinter : public AsmPrinter { void emitPtrauthAuthResign(const MachineInstr *MI); // Emit the sequence to compute a discriminator into x17, or reuse AddrDisc. - unsigned emitPtrauthDiscriminator(uint16_t Disc, unsigned AddrDisc, -unsigned &InstsEmitted); + unsigned emitPtrauthDiscriminator(uint16_t Disc, unsigned AddrDisc); // Emit the sequence for LOADauthptrstatic void LowerLOADauthptrstatic(const MachineInstr &MI); @@ -1338,8 +1341,6 @@ void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer, } void AArch64AsmPrinter::LowerHardenedBRJumpTable(const MachineInstr &MI) { - unsigned InstsEmitted = 0; - const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); assert(MJTI && "Can't lower jump-table dispatch without JTI"); @@ -1377,10 +1378,8 @@ void AArch64AsmPrinter::LowerHardenedBRJumpTable(const MachineInstr &MI) { .addReg(AArch64::X16) .addImm(MaxTableEntry) .addImm(0)); -++InstsEmitted; } else { emitMOVZ(AArch64::X17, static_cast(MaxTableEntry), 0); -++InstsEmitted; // It's sad that we have to manually materialize instructions, but we can't // trivially reuse the main pseudo expansion logic. // A MOVK sequence is easy enough to generate and handles the general case. @@ -1389,14 +1388,12 @@ void AArch64AsmPrinter::LowerHardenedBRJumpTable(const MachineInstr &MI) { break; emitMOVK(AArch64::X17, static_cast(MaxTableEntry >> Offset), Offset); - ++InstsEmitted; } EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::SUBSXrs) .addReg(AArch64::XZR) .addReg(AArch64::X16) .addReg(AArch64::X17) .addImm(0)); -++InstsEmitted; } // This picks entry #0 on failure. @@ -1406,7 +1403,6 @@ void AArch64AsmPrinter::LowerHardenedBRJumpTable(const MachineInstr &MI) { .addReg(AArch64::X16) .addReg(AArch64::XZR) .addImm(AArch64CC::LS)); - ++InstsEmitted; // Prepare the @PAGE/@PAGEOFF low/high operands. 
MachineOperand JTMOHi(JTOp), JTMOLo(JTOp); @@ -1421,14 +1417,12 @@ void AArch64AsmPrinter::LowerHardenedBRJumpTable(const MachineInstr &MI) { EmitToStreamer( *OutStreamer, MCInstBuilder(AArch64::ADRP).addReg(AArch64::X17).addOperand(JTMCHi)); - ++InstsEmitted; EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXri) .addReg(AArch64::X17) .addReg(AArch64::X17) .addOperand(JTMCLo) .addImm(0)); - ++InstsEmitted; EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::LDRSWroX) .addReg(AArch64::X16) @@ -1436,7 +1430,6 @@ void AArch64AsmPrinter::LowerHardenedBRJumpTable(const MachineInstr &MI) { .addReg(AArch64::X16)
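The mechanism the commit message describes can be summarized with a short sketch: route all emission in this file through a counting wrapper, so each expansion no longer keeps its own tally (the exact shape in the PR may differ; `InstsEmitted` is the `#ifndef NDEBUG` counter added to the class above):

```cpp
// Hedged sketch: count every MCInst actually handed to the streamer.
void AArch64AsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
#ifndef NDEBUG
  ++InstsEmitted;
#endif
  AsmPrinter::EmitToStreamer(S, Inst);
}
```

With this in place, `emitInstruction` can reset the counter on entry and assert once at the end that `InstsEmitted * 4` does not exceed `getInstSizeInBytes(*MI)`, which is why the scattered `++InstsEmitted` lines are deleted throughout the diff above.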
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Improve omp.section block arguments handling (PR #110266)
https://github.com/tblah approved this pull request. Thanks! https://github.com/llvm/llvm-project/pull/110266
[llvm-branch-commits] [flang] [Flang][OpenMP] Improve entry block argument creation and binding (PR #110267)
https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/110267 >From 2c5d74d932797b916b5f0da6fb017b5f4af2b2b4 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 27 Sep 2024 13:51:27 +0100 Subject: [PATCH] [Flang][OpenMP] Improve entry block argument creation and binding The main purpose of this patch is to centralize the logic for creating MLIR operation entry blocks and for binding them to the corresponding symbols. This minimizes the chances of mixing arguments up for operations having multiple entry block argument-generating clauses and prevents divergence while binding arguments. Some changes implemented to this end are: - Split into two functions the creation of the entry block, and the binding of its arguments and the corresponding Fortran symbol. This enabled a significant simplification of the lowering of composite constructs, where it's no longer necessary to manually ensure the lists of arguments and symbols refer to the same variables in the same order and also match the expected order by the `BlockArgOpenMPOpInterface`. - Removed redundant and error-prone passing of types and locations from `ClauseProcessor` methods. Instead, these are obtained from the values in the appropriate clause operands structure. This also simplifies argument lists of several lowering functions. - Access block arguments of already created MLIR operations through the `BlockArgOpenMPOpInterface` instead of directly indexing the argument list of the operation, which is not scalable as more entry block argument-generating clauses are added to an operation. - Simplified the implementation of `genParallelOp` to no longer need to define different callbacks depending on whether delayed privatization is enabled. --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp| 79 +- flang/lib/Lower/OpenMP/ClauseProcessor.h | 38 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 1016 + flang/lib/Lower/OpenMP/ReductionProcessor.cpp |5 +- flang/lib/Lower/OpenMP/ReductionProcessor.h |3 +- flang/lib/Lower/OpenMP/Utils.cpp |9 +- flang/lib/Lower/OpenMP/Utils.h|4 +- 7 files changed, 554 insertions(+), 600 deletions(-) diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index e9ef8579100e93..44f5ca7f342707 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -166,15 +166,11 @@ getIfClauseOperand(lower::AbstractConverter &converter, static void addUseDeviceClause( lower::AbstractConverter &converter, const omp::ObjectList &objects, llvm::SmallVectorImpl &operands, -llvm::SmallVectorImpl &useDeviceTypes, -llvm::SmallVectorImpl &useDeviceLocs, llvm::SmallVectorImpl &useDeviceSyms) { genObjectList(objects, converter, operands); - for (mlir::Value &operand : operands) { + for (mlir::Value &operand : operands) checkMapType(operand.getLoc(), operand.getType()); -useDeviceTypes.push_back(operand.getType()); -useDeviceLocs.push_back(operand.getLoc()); - } + for (const omp::Object &object : objects) useDeviceSyms.push_back(object.sym()); } @@ -832,14 +828,12 @@ bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const { bool ClauseProcessor::processHasDeviceAddr( mlir::omp::HasDeviceAddrClauseOps &result, -llvm::SmallVectorImpl &isDeviceTypes, -llvm::SmallVectorImpl &isDeviceLocs, -llvm::SmallVectorImpl &isDeviceSymbols) const { +llvm::SmallVectorImpl &isDeviceSyms) const { return findRepeatableClause( [&](const omp::clause::HasDeviceAddr &devAddrClause, const parser::CharBlock &) { addUseDeviceClause(converter, 
devAddrClause.v, result.hasDeviceAddrVars, - isDeviceTypes, isDeviceLocs, isDeviceSymbols); + isDeviceSyms); }); } @@ -864,14 +858,12 @@ bool ClauseProcessor::processIf( bool ClauseProcessor::processIsDevicePtr( mlir::omp::IsDevicePtrClauseOps &result, -llvm::SmallVectorImpl &isDeviceTypes, -llvm::SmallVectorImpl &isDeviceLocs, -llvm::SmallVectorImpl &isDeviceSymbols) const { +llvm::SmallVectorImpl &isDeviceSyms) const { return findRepeatableClause( [&](const omp::clause::IsDevicePtr &devPtrClause, const parser::CharBlock &) { addUseDeviceClause(converter, devPtrClause.v, result.isDevicePtrVars, - isDeviceTypes, isDeviceLocs, isDeviceSymbols); + isDeviceSyms); }); } @@ -892,9 +884,7 @@ void ClauseProcessor::processMapObjects( std::map> &parentMemberIndices, llvm::SmallVectorImpl &mapVars, -llvm::SmallVectorImpl *mapSyms, -llvm::SmallVectorImpl *mapSymLocs, -llvm::SmallVectorImpl *mapSymTypes) const { +llvm::SmallVectorImpl &mapSyms) const { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); for
[llvm-branch-commits] [llvm] [AMDGPU] Serialize WWM_REG vreg flag (PR #110229)
@@ -3614,3 +3614,14 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, } return 0; } + +SmallVector arsenm wrote: probably should just be const char*, this will probably only ever be used with literals https://github.com/llvm/llvm-project/pull/110229 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
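A sketch of the suggested signature change; since the quoted hunk is truncated, the getter name and flag enum below are assumptions for illustration only:

```cpp
// Return string literals directly: the flag names are compile-time
// constants, so SmallVector<const char *> avoids per-element allocations.
// (getVRegFlagsOfReg and VirtRegFlag::WWM_REG are assumed names.)
SmallVector<const char *>
SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
                                  const MachineFunction &MF) const {
  SmallVector<const char *> RegFlags;
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
    RegFlags.push_back("WWM_REG");
  return RegFlags;
}
```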
[llvm-branch-commits] [llvm] [AMDGPU] Serialize WWM_REG vreg flag (PR #110229)
@@ -0,0 +1,16 @@ +# RUN: llc -mtriple=amdgcn -run-pass=none -o - %s | FileCheck %s +# This test ensures that the MIR parser parses virtual register flags correctly + +--- +name: vregs +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vgpr_32, preferred-register: '$vgpr1', flags: [ WWM_REG ] } +# CHECK-NEXT: - { id: 1, class: sgpr_64, preferred-register: '$sgpr0_sgpr1', flags: [ ] } +# CHECK-NEXT: - { id: 2, class: sgpr_64, preferred-register: '', flags: [ ] } +registers: + - { id: 0, class: vgpr_32, preferred-register: $vgpr1, flags: [ WWM_REG ]} + - { id: 1, class: sgpr_64, preferred-register: $sgpr0_sgpr1 } arsenm wrote: Also test an explicitly empty case https://github.com/llvm/llvm-project/pull/110229 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Serialize WWM_REG vreg flag (PR #110229)
@@ -1628,6 +1628,21 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->reserveWWMRegister(ParsedReg); } + auto setRegisterFlags = [&](const VRegInfo &Info) { +for (const auto &Flag : Info.Flags) { + MFI->setFlag(Info.VReg, Flag); +} + }; + + for (const auto &P : PFS.VRegInfosNamed) { +const VRegInfo &Info = *P.second; +setRegisterFlags(Info); + } + for (const auto &P : PFS.VRegInfos) { +const VRegInfo &Info = *P.second; arsenm wrote: c++17 destructuring https://github.com/llvm/llvm-project/pull/110229 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Serialize WWM_REG vreg flag (PR #110229)
@@ -1628,6 +1628,21 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->reserveWWMRegister(ParsedReg); } + auto setRegisterFlags = [&](const VRegInfo &Info) { +for (const auto &Flag : Info.Flags) { + MFI->setFlag(Info.VReg, Flag); +} + }; + + for (const auto &P : PFS.VRegInfosNamed) { +const VRegInfo &Info = *P.second; arsenm wrote: c++17 destructuring https://github.com/llvm/llvm-project/pull/110229 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
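Both loops flagged in this and the previous comment can use C++17 structured bindings; a sketch, given that the map values are `VRegInfo *` per the quoted code:

```cpp
// Destructure the map entries instead of indirecting through P.second.
for (const auto &[Name, Info] : PFS.VRegInfosNamed)
  setRegisterFlags(*Info);
for (const auto &[Num, Info] : PFS.VRegInfos)
  setRegisterFlags(*Info);
```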
[llvm-branch-commits] [llvm] [AMDGPU] Serialize WWM_REG vreg flag (PR #110229)
@@ -1628,6 +1628,21 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->reserveWWMRegister(ParsedReg); } + auto setRegisterFlags = [&](const VRegInfo &Info) { +for (const auto &Flag : Info.Flags) { arsenm wrote: No auto, no reference. This is just uint8_t https://github.com/llvm/llvm-project/pull/110229 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
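A sketch of the requested form, iterating by value since each flag is a plain `uint8_t`:

```cpp
for (uint8_t Flag : Info.Flags)
  MFI->setFlag(Info.VReg, Flag);
```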
[llvm-branch-commits] [llvm] [AArch64] Generalize the instruction size checking in AsmPrinter (PR #110108)
atrosinenko wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/110108). Learn more: https://graphite.dev/docs/merge-pull-requests

* **#110108** 👈 (this PR)
* **#110107**
* `main`

This stack of pull requests is managed by Graphite. Learn more about stacking: https://stacking.dev/
https://github.com/llvm/llvm-project/pull/110108
[llvm-branch-commits] [mlir] [mlir][Transforms][NFC] Dialect Conversion: Simplify `finalize` signature (PR #110419)
https://github.com/zero9178 approved this pull request. https://github.com/llvm/llvm-project/pull/110419
[llvm-branch-commits] [BOLT] Support --show-density for fdata and YAML profiles (PR #110567)
https://github.com/aaupov created https://github.com/llvm/llvm-project/pull/110567 None
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Document entry block argument-defining clauses (NFC) (PR #109811)
@@ -285,7 +285,75 @@ argument's type: specific `mlir::Attribute` subclass) will be used instead. - Other attribute types will be represented with their `storageType`. - It will create `Operands` structure for each operation, which is an -empty structure subclassing all operand structures defined for the corresponding `OpenMP_Op`'s clauses. +empty structure subclassing all operand structures defined for the corresponding +`OpenMP_Op`'s clauses. + +### Entry Block Argument-Defining Clauses + +Certain OpenMP clauses introduce in their MLIR representation mappings between +outside values and entry block arguments for the region of the MLIR operation +they are applied to. This enables, for example, the introduction of private bhandarkar-pranav wrote: Please consider ``` This enables, for example, the introduction of private copies of the same underlying variable defined outside the MLIR operation the clause is attached to. ``` https://github.com/llvm/llvm-project/pull/109811 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Document entry block argument-defining clauses (NFC) (PR #109811)
@@ -285,7 +285,75 @@ argument's type: specific `mlir::Attribute` subclass) will be used instead. - Other attribute types will be represented with their `storageType`. - It will create `Operands` structure for each operation, which is an -empty structure subclassing all operand structures defined for the corresponding `OpenMP_Op`'s clauses. +empty structure subclassing all operand structures defined for the corresponding +`OpenMP_Op`'s clauses. + +### Entry Block Argument-Defining Clauses + +Certain OpenMP clauses introduce in their MLIR representation mappings between +outside values and entry block arguments for the region of the MLIR operation +they are applied to. This enables, for example, the introduction of private +copies of the same underlying variable. Currently, clauses with this property +can be classified in three main categories: + - Map-like clauses: `map`, `use_device_addr` and `use_device_ptr`. + - Reduction-like clauses: `in_reduction`, `reduction` and `task_reduction`. + - Privatization clause: `private`. + +All three kinds of entry block argument-defining clauses use a similar custom +assembly format representation, only differing based on the different pieces of +information attached to each kind. Below, one example of each is shown: + +```mlir +omp.target map_entries(%x -> %x.m, %y -> %y.m : !llvm.ptr, !llvm.ptr) { + // Use %x.m, %y.m in place of %x and %y... +} + +omp.wsloop reduction(@add.i32 %x -> %x.r, byref @add.f32 %y -> %y.r : !llvm.ptr, !llvm.ptr) { + // Use %x.r, %y.r in place of %x and %y... +} + +omp.parallel private(@x.privatizer %x -> %x.p, @y.privatizer %y -> %y.p : !llvm.ptr, !llvm.ptr) { + // Use %x.p, %y.p in place of %x and %y... +} +``` + +As a consequence of parsing and printing the operation's first region entry +block argument names together with the custom assembly format of these clauses, +entry block arguments (i.e. the `^bb0(...):` line) must not be explicitly +defined for these operations. Additionally, it is not possible to implement this +feature while allowing each clause to be independently parsed and printed, +because they need to be printed/parsed together with the corresponding +operation's first region. They must have a well-defined ordering in which +multiple of these clauses are specified for a given operation, as well. + +The parsing/printing of these clauses together with the region provides the +ability to define entry block arguments directly after the `->`. Forcing a +specific ordering between these clauses makes the block argument ordering +well-defined, which is the property used to easily match each clause with the +entry block arguments defined by it. + +Custom printers and parsers for operation regions based on the entry block +argument-defining clauses they take are implemented based on the +`{parse,print}BlockArgRegion` functions, which take care of the sorting and +formatting of each kind of clause, minimizing code duplication resulting from +this approach. One example of the custom assembly format of an operation taking +the `private` and `reduction` clauses is the following: + +```tablegen +let assemblyFormat = clausesAssemblyFormat # [{ + custom($region, $private_vars, type($private_vars), + $private_syms, $reduction_vars, type($reduction_vars), $reduction_byref, + $reduction_syms) attr-dict +}]; +``` + +The `BlockArgOpenMPOpInterface` has been introduced to simplify the addition and +handling of these kinds of clauses. 
It holds `numBlockArgs()` +functions that by default return 0, to be overriden by each clause through the +`extraClassDeclaration` property. Based on these functions and the expected +alphabetical sorting between entry block argument-defining clauses, it bhandarkar-pranav wrote: I am assuming the tablegen backend for openmp that you have implemented doesn't do the sorting and the onus for the alphabetical sorting is on the user, correct? If that's the case i think that expectation must be made explicit, either here or (preferably) in [Adding an Operation](https://mlir.llvm.org/docs/Dialects/OpenMPDialect/#adding-an-operation) https://github.com/llvm/llvm-project/pull/109811 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Document entry block argument-defining clauses (NFC) (PR #109811)
https://github.com/bhandarkar-pranav edited https://github.com/llvm/llvm-project/pull/109811
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Document entry block argument-defining clauses (NFC) (PR #109811)
https://github.com/bhandarkar-pranav commented: LGTM. Given how reviewing docs essentially turns into a series of subjective opinions or preferences, please consider almost all of my comments as nits, except the one about alphabetical sorting of clauses. https://github.com/llvm/llvm-project/pull/109811
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Document entry block argument-defining clauses (NFC) (PR #109811)
@@ -285,7 +285,75 @@ argument's type: specific `mlir::Attribute` subclass) will be used instead. - Other attribute types will be represented with their `storageType`. - It will create `Operands` structure for each operation, which is an -empty structure subclassing all operand structures defined for the corresponding `OpenMP_Op`'s clauses. +empty structure subclassing all operand structures defined for the corresponding +`OpenMP_Op`'s clauses. + +### Entry Block Argument-Defining Clauses + +Certain OpenMP clauses introduce in their MLIR representation mappings between bhandarkar-pranav wrote: Could you consider the following reordering and slight rewording of the first sentence? ``` In their MLIR representation, certain OpenMP clauses introduce a mapping between values defined outside the operation they are applied to and entry block arguments for the region of that MLIR operation. ``` https://github.com/llvm/llvm-project/pull/109811 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Document entry block argument-defining clauses (NFC) (PR #109811)
@@ -285,7 +285,75 @@ argument's type: specific `mlir::Attribute` subclass) will be used instead. - Other attribute types will be represented with their `storageType`. - It will create `Operands` structure for each operation, which is an -empty structure subclassing all operand structures defined for the corresponding `OpenMP_Op`'s clauses. +empty structure subclassing all operand structures defined for the corresponding +`OpenMP_Op`'s clauses. + +### Entry Block Argument-Defining Clauses + +Certain OpenMP clauses introduce in their MLIR representation mappings between +outside values and entry block arguments for the region of the MLIR operation +they are applied to. This enables, for example, the introduction of private +copies of the same underlying variable. Currently, clauses with this property +can be classified in three main categories: bhandarkar-pranav wrote: `s/in three/into three` https://github.com/llvm/llvm-project/pull/109811 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][OpenMP] Document entry block argument-defining clauses (NFC) (PR #109811)
@@ -285,7 +285,75 @@ argument's type: specific `mlir::Attribute` subclass) will be used instead. - Other attribute types will be represented with their `storageType`. - It will create `Operands` structure for each operation, which is an -empty structure subclassing all operand structures defined for the corresponding `OpenMP_Op`'s clauses. +empty structure subclassing all operand structures defined for the corresponding +`OpenMP_Op`'s clauses. + +### Entry Block Argument-Defining Clauses + +Certain OpenMP clauses introduce in their MLIR representation mappings between +outside values and entry block arguments for the region of the MLIR operation +they are applied to. This enables, for example, the introduction of private +copies of the same underlying variable. Currently, clauses with this property +can be classified in three main categories: + - Map-like clauses: `map`, `use_device_addr` and `use_device_ptr`. + - Reduction-like clauses: `in_reduction`, `reduction` and `task_reduction`. + - Privatization clause: `private`. bhandarkar-pranav wrote: Ultra Nit: I think it should be `Privatization clauses:` even if the set has a solitary element. https://github.com/llvm/llvm-project/pull/109811 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] a7554df - [LoongArch][ISel] Check the number of sign bits in `PatGprGpr_32` (#107432)
Author: Yingwei Zheng Date: 2024-10-01T08:51:27+02:00 New Revision: a7554dfc222b13624426ebd6ef46e122b9c16ee7 URL: https://github.com/llvm/llvm-project/commit/a7554dfc222b13624426ebd6ef46e122b9c16ee7 DIFF: https://github.com/llvm/llvm-project/commit/a7554dfc222b13624426ebd6ef46e122b9c16ee7.diff LOG: [LoongArch][ISel] Check the number of sign bits in `PatGprGpr_32` (#107432) After https://github.com/llvm/llvm-project/pull/92205, LoongArch ISel selects `div.w` for `trunc i64 (sdiv i64 3202030857, (sext i32 X to i64)) to i32`. It is incorrect since `3202030857` is not a signed 32-bit constant. It will produce wrong result when `X == 2`: https://alive2.llvm.org/ce/z/pzfGZZ This patch adds additional `sexti32` checks to operands of `PatGprGpr_32`. Alive2 proof: https://alive2.llvm.org/ce/z/AkH5Mp Fix #107414. (cherry picked from commit a111f9119a5ec77c19a514ec09454218f739454f) Added: Modified: llvm/lib/Target/LoongArch/LoongArchInstrInfo.td llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll Removed: diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index ef647a42778737..339d50bd819217 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -1065,10 +1065,13 @@ def RDTIME_D : RDTIME_2R<0x6800>; /// Generic pattern classes +def assertsexti32 : PatFrag<(ops node:$src), (assertsext node:$src), [{ + return cast(N->getOperand(1))->getVT().bitsLE(MVT::i32); +}]>; class PatGprGpr : Pat<(OpNode GPR:$rj, GPR:$rk), (Inst GPR:$rj, GPR:$rk)>; class PatGprGpr_32 -: Pat<(sext_inreg (OpNode GPR:$rj, GPR:$rk), i32), (Inst GPR:$rj, GPR:$rk)>; +: Pat<(sext_inreg (OpNode (assertsexti32 GPR:$rj), (assertsexti32 GPR:$rk)), i32), (Inst GPR:$rj, GPR:$rk)>; class PatGpr : Pat<(OpNode GPR:$rj), (Inst GPR:$rj)>; diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll index ab3eec240db3c1..c22acdb4969071 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll @@ -191,7 +191,8 @@ define signext i32 @sdiv_si32_ui32_ui32(i32 %a, i32 %b) { ; LA64: # %bb.0: # %entry ; LA64-NEXT:addi.w $a1, $a1, 0 ; LA64-NEXT:addi.w $a0, $a0, 0 -; LA64-NEXT:div.w $a0, $a0, $a1 +; LA64-NEXT:div.d $a0, $a0, $a1 +; LA64-NEXT:addi.w $a0, $a0, 0 ; LA64-NEXT:ret ; ; LA32-TRAP-LABEL: sdiv_si32_ui32_ui32: @@ -207,11 +208,12 @@ define signext i32 @sdiv_si32_ui32_ui32(i32 %a, i32 %b) { ; LA64-TRAP: # %bb.0: # %entry ; LA64-TRAP-NEXT:addi.w $a1, $a1, 0 ; LA64-TRAP-NEXT:addi.w $a0, $a0, 0 -; LA64-TRAP-NEXT:div.w $a0, $a0, $a1 +; LA64-TRAP-NEXT:div.d $a0, $a0, $a1 ; LA64-TRAP-NEXT:bnez $a1, .LBB5_2 ; LA64-TRAP-NEXT: # %bb.1: # %entry ; LA64-TRAP-NEXT:break 7 ; LA64-TRAP-NEXT: .LBB5_2: # %entry +; LA64-TRAP-NEXT:addi.w $a0, $a0, 0 ; LA64-TRAP-NEXT:ret entry: %r = sdiv i32 %a, %b @@ -1151,3 +1153,64 @@ entry: %r = urem i64 %a, %b ret i64 %r } + +define signext i32 @pr107414(i32 signext %x) { +; LA32-LABEL: pr107414: +; LA32: # %bb.0: # %entry +; LA32-NEXT:addi.w $sp, $sp, -16 +; LA32-NEXT:.cfi_def_cfa_offset 16 +; LA32-NEXT:st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT:.cfi_offset 1, -4 +; LA32-NEXT:move $a2, $a0 +; LA32-NEXT:srai.w $a3, $a0, 31 +; LA32-NEXT:lu12i.w $a0, -266831 +; LA32-NEXT:ori $a0, $a0, 3337 +; LA32-NEXT:move $a1, $zero +; LA32-NEXT:bl %plt(__divdi3) +; LA32-NEXT:ld.w $ra, $sp, 12 # 4-byte Folded Reload +; 
LA32-NEXT:addi.w $sp, $sp, 16 +; LA32-NEXT:ret +; +; LA64-LABEL: pr107414: +; LA64: # %bb.0: # %entry +; LA64-NEXT:lu12i.w $a1, -266831 +; LA64-NEXT:ori $a1, $a1, 3337 +; LA64-NEXT:lu32i.d $a1, 0 +; LA64-NEXT:div.d $a0, $a1, $a0 +; LA64-NEXT:addi.w $a0, $a0, 0 +; LA64-NEXT:ret +; +; LA32-TRAP-LABEL: pr107414: +; LA32-TRAP: # %bb.0: # %entry +; LA32-TRAP-NEXT:addi.w $sp, $sp, -16 +; LA32-TRAP-NEXT:.cfi_def_cfa_offset 16 +; LA32-TRAP-NEXT:st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-TRAP-NEXT:.cfi_offset 1, -4 +; LA32-TRAP-NEXT:move $a2, $a0 +; LA32-TRAP-NEXT:srai.w $a3, $a0, 31 +; LA32-TRAP-NEXT:lu12i.w $a0, -266831 +; LA32-TRAP-NEXT:ori $a0, $a0, 3337 +; LA32-TRAP-NEXT:move $a1, $zero +; LA32-TRAP-NEXT:bl %plt(__divdi3) +; LA32-TRAP-NEXT:ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-TRAP-NEXT:addi.w $sp, $sp, 16 +; LA32-TRAP-NEXT:ret +; +; LA64-TRAP-LABEL: pr107414: +; LA64-TRAP: # %bb.0: # %entry +; LA64-TRAP-NEXT:lu12i.w $a1, -266831 +; LA64-TRAP-NEXT:ori $a1, $a1, 3337 +; LA64-TRAP-NEXT:lu32i.d $a1, 0 +; LA64-TRAP
[llvm-branch-commits] [llvm] 9905852 - [LoongArch] Eliminate the redundant sign extension of division (#107971)
Author: hev Date: 2024-10-01T08:51:27+02:00 New Revision: 99058521d4c80635f60b2c1442b683395e0ee818 URL: https://github.com/llvm/llvm-project/commit/99058521d4c80635f60b2c1442b683395e0ee818 DIFF: https://github.com/llvm/llvm-project/commit/99058521d4c80635f60b2c1442b683395e0ee818.diff LOG: [LoongArch] Eliminate the redundant sign extension of division (#107971) If all incoming values of `div.d` are sign-extended and all users only use the lower 32 bits, then convert them to W versions. Fixes: #107946 (cherry picked from commit 0f47e3aebdd2a4a938468a272ea4224552dbf176) Added: Modified: llvm/lib/Target/LoongArch/LoongArchOptWInstrs.cpp llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll Removed: diff --git a/llvm/lib/Target/LoongArch/LoongArchOptWInstrs.cpp b/llvm/lib/Target/LoongArch/LoongArchOptWInstrs.cpp index abac69054f3b91..ab90409fdf47d0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchOptWInstrs.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchOptWInstrs.cpp @@ -637,6 +637,19 @@ static bool isSignExtendedW(Register SrcReg, const LoongArchSubtarget &ST, break; } return false; +// If all incoming values are sign-extended and all users only use +// the lower 32 bits, then convert them to W versions. +case LoongArch::DIV_D: { + if (!AddRegToWorkList(MI->getOperand(1).getReg())) +return false; + if (!AddRegToWorkList(MI->getOperand(2).getReg())) +return false; + if (hasAllWUsers(*MI, ST, MRI)) { +FixableDef.insert(MI); +break; + } + return false; +} } } @@ -651,6 +664,8 @@ static unsigned getWOp(unsigned Opcode) { return LoongArch::ADDI_W; case LoongArch::ADD_D: return LoongArch::ADD_W; + case LoongArch::DIV_D: +return LoongArch::DIV_W; case LoongArch::LD_D: case LoongArch::LD_WU: return LoongArch::LD_W; diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll index c22acdb4969071..c5af79157eaadc 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll @@ -191,8 +191,7 @@ define signext i32 @sdiv_si32_ui32_ui32(i32 %a, i32 %b) { ; LA64: # %bb.0: # %entry ; LA64-NEXT:addi.w $a1, $a1, 0 ; LA64-NEXT:addi.w $a0, $a0, 0 -; LA64-NEXT:div.d $a0, $a0, $a1 -; LA64-NEXT:addi.w $a0, $a0, 0 +; LA64-NEXT:div.w $a0, $a0, $a1 ; LA64-NEXT:ret ; ; LA32-TRAP-LABEL: sdiv_si32_ui32_ui32: @@ -208,12 +207,11 @@ define signext i32 @sdiv_si32_ui32_ui32(i32 %a, i32 %b) { ; LA64-TRAP: # %bb.0: # %entry ; LA64-TRAP-NEXT:addi.w $a1, $a1, 0 ; LA64-TRAP-NEXT:addi.w $a0, $a0, 0 -; LA64-TRAP-NEXT:div.d $a0, $a0, $a1 +; LA64-TRAP-NEXT:div.w $a0, $a0, $a1 ; LA64-TRAP-NEXT:bnez $a1, .LBB5_2 ; LA64-TRAP-NEXT: # %bb.1: # %entry ; LA64-TRAP-NEXT:break 7 ; LA64-TRAP-NEXT: .LBB5_2: # %entry -; LA64-TRAP-NEXT:addi.w $a0, $a0, 0 ; LA64-TRAP-NEXT:ret entry: %r = sdiv i32 %a, %b ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [LoongArch] Eliminate the redundant sign extension of division (#107971) (PR #109125)
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/109125 >From a7554dfc222b13624426ebd6ef46e122b9c16ee7 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Tue, 10 Sep 2024 09:19:39 +0800 Subject: [PATCH 1/2] [LoongArch][ISel] Check the number of sign bits in `PatGprGpr_32` (#107432) After https://github.com/llvm/llvm-project/pull/92205, LoongArch ISel selects `div.w` for `trunc i64 (sdiv i64 3202030857, (sext i32 X to i64)) to i32`. It is incorrect since `3202030857` is not a signed 32-bit constant. It will produce wrong result when `X == 2`: https://alive2.llvm.org/ce/z/pzfGZZ This patch adds additional `sexti32` checks to operands of `PatGprGpr_32`. Alive2 proof: https://alive2.llvm.org/ce/z/AkH5Mp Fix #107414. (cherry picked from commit a111f9119a5ec77c19a514ec09454218f739454f) --- .../Target/LoongArch/LoongArchInstrInfo.td| 5 +- .../ir-instruction/sdiv-udiv-srem-urem.ll | 67 ++- 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index ef647a42778737..339d50bd819217 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -1065,10 +1065,13 @@ def RDTIME_D : RDTIME_2R<0x6800>; /// Generic pattern classes +def assertsexti32 : PatFrag<(ops node:$src), (assertsext node:$src), [{ + return cast(N->getOperand(1))->getVT().bitsLE(MVT::i32); +}]>; class PatGprGpr : Pat<(OpNode GPR:$rj, GPR:$rk), (Inst GPR:$rj, GPR:$rk)>; class PatGprGpr_32 -: Pat<(sext_inreg (OpNode GPR:$rj, GPR:$rk), i32), (Inst GPR:$rj, GPR:$rk)>; +: Pat<(sext_inreg (OpNode (assertsexti32 GPR:$rj), (assertsexti32 GPR:$rk)), i32), (Inst GPR:$rj, GPR:$rk)>; class PatGpr : Pat<(OpNode GPR:$rj), (Inst GPR:$rj)>; diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll index ab3eec240db3c1..c22acdb4969071 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll @@ -191,7 +191,8 @@ define signext i32 @sdiv_si32_ui32_ui32(i32 %a, i32 %b) { ; LA64: # %bb.0: # %entry ; LA64-NEXT:addi.w $a1, $a1, 0 ; LA64-NEXT:addi.w $a0, $a0, 0 -; LA64-NEXT:div.w $a0, $a0, $a1 +; LA64-NEXT:div.d $a0, $a0, $a1 +; LA64-NEXT:addi.w $a0, $a0, 0 ; LA64-NEXT:ret ; ; LA32-TRAP-LABEL: sdiv_si32_ui32_ui32: @@ -207,11 +208,12 @@ define signext i32 @sdiv_si32_ui32_ui32(i32 %a, i32 %b) { ; LA64-TRAP: # %bb.0: # %entry ; LA64-TRAP-NEXT:addi.w $a1, $a1, 0 ; LA64-TRAP-NEXT:addi.w $a0, $a0, 0 -; LA64-TRAP-NEXT:div.w $a0, $a0, $a1 +; LA64-TRAP-NEXT:div.d $a0, $a0, $a1 ; LA64-TRAP-NEXT:bnez $a1, .LBB5_2 ; LA64-TRAP-NEXT: # %bb.1: # %entry ; LA64-TRAP-NEXT:break 7 ; LA64-TRAP-NEXT: .LBB5_2: # %entry +; LA64-TRAP-NEXT:addi.w $a0, $a0, 0 ; LA64-TRAP-NEXT:ret entry: %r = sdiv i32 %a, %b @@ -1151,3 +1153,64 @@ entry: %r = urem i64 %a, %b ret i64 %r } + +define signext i32 @pr107414(i32 signext %x) { +; LA32-LABEL: pr107414: +; LA32: # %bb.0: # %entry +; LA32-NEXT:addi.w $sp, $sp, -16 +; LA32-NEXT:.cfi_def_cfa_offset 16 +; LA32-NEXT:st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT:.cfi_offset 1, -4 +; LA32-NEXT:move $a2, $a0 +; LA32-NEXT:srai.w $a3, $a0, 31 +; LA32-NEXT:lu12i.w $a0, -266831 +; LA32-NEXT:ori $a0, $a0, 3337 +; LA32-NEXT:move $a1, $zero +; LA32-NEXT:bl %plt(__divdi3) +; LA32-NEXT:ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT:addi.w $sp, $sp, 16 +; LA32-NEXT:ret +; +; LA64-LABEL: 
pr107414: +; LA64: # %bb.0: # %entry +; LA64-NEXT:lu12i.w $a1, -266831 +; LA64-NEXT:ori $a1, $a1, 3337 +; LA64-NEXT:lu32i.d $a1, 0 +; LA64-NEXT:div.d $a0, $a1, $a0 +; LA64-NEXT:addi.w $a0, $a0, 0 +; LA64-NEXT:ret +; +; LA32-TRAP-LABEL: pr107414: +; LA32-TRAP: # %bb.0: # %entry +; LA32-TRAP-NEXT:addi.w $sp, $sp, -16 +; LA32-TRAP-NEXT:.cfi_def_cfa_offset 16 +; LA32-TRAP-NEXT:st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-TRAP-NEXT:.cfi_offset 1, -4 +; LA32-TRAP-NEXT:move $a2, $a0 +; LA32-TRAP-NEXT:srai.w $a3, $a0, 31 +; LA32-TRAP-NEXT:lu12i.w $a0, -266831 +; LA32-TRAP-NEXT:ori $a0, $a0, 3337 +; LA32-TRAP-NEXT:move $a1, $zero +; LA32-TRAP-NEXT:bl %plt(__divdi3) +; LA32-TRAP-NEXT:ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-TRAP-NEXT:addi.w $sp, $sp, 16 +; LA32-TRAP-NEXT:ret +; +; LA64-TRAP-LABEL: pr107414: +; LA64-TRAP: # %bb.0: # %entry +; LA64-TRAP-NEXT:lu12i.w $a1, -266831 +; LA64-TRAP-NEXT:ori $a1, $a1, 3337 +; LA64-TRAP-NEXT:lu32i.d $a1, 0 +; LA64-TRAP-NEXT:div.d $a1, $a1, $a0 +; LA64-TRAP-NEXT:bnez $a0, .LBB32_2 +; LA64-TRAP-NEXT: # %bb.1: # %entry +; LA64-TRAP-NEXT:break 7
[llvm-branch-commits] [llvm] release/19.x: [LoongArch] Eliminate the redundant sign extension of division (#107971) (PR #109125)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/109125
[llvm-branch-commits] [llvm] [AArch64] Generalize the instruction size checking in AsmPrinter (PR #110108)
@@ -2546,6 +2510,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { TLSDescCall.setOpcode(AArch64::TLSDESCCALL); TLSDescCall.addOperand(Sym); EmitToStreamer(*OutStreamer, TLSDescCall); +--InstsEmitted; // no code emitted davemgreen wrote: Will this need #ifndef NDEBUG? https://github.com/llvm/llvm-project/pull/110108 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
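One way to resolve the question, assuming the counter itself stays declared under `#ifndef NDEBUG` as in the current patch (otherwise the decrement would not compile in release builds):

```cpp
#ifndef NDEBUG
  --InstsEmitted; // TLSDESCCALL emits no machine code
#endif
```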
[llvm-branch-commits] [llvm] release/19.x: [LoongArch] Fix the assertion for atomic store with 'ptr' type (PR #109915)
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/109915 >From b3734d9f93c1f8d908836a966f77c6792242df99 Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Mon, 19 Aug 2024 16:51:21 +0800 Subject: [PATCH] [LoongArch] Fix the assertion for atomic store with 'ptr' type (cherry picked from commit 63267ca9016aa334b329aa408716456b4e3799c8) --- .../LoongArch/LoongArchISelLowering.cpp | 5 +- .../ir-instruction/load-store-atomic.ll | 119 ++ 2 files changed, 122 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 93edafaff553ba..082b42398c6a71 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5601,8 +5601,9 @@ bool LoongArchTargetLowering::shouldInsertFencesForAtomic( // On LA64, atomic store operations with IntegerBitWidth of 32 and 64 do not // require fences beacuse we can use amswap_db.[w/d]. - if (isa(I)) { -unsigned Size = I->getOperand(0)->getType()->getIntegerBitWidth(); + Type *Ty = I->getOperand(0)->getType(); + if (isa(I) && Ty->isIntegerTy()) { +unsigned Size = Ty->getIntegerBitWidth(); return (Size == 8 || Size == 16); } diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll index c51fded410e83b..1af2b38d799436 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll @@ -72,6 +72,22 @@ define i64 @load_acquire_i64(ptr %ptr) { ret i64 %val } +define ptr @load_acquire_ptr(ptr %ptr) { +; LA32-LABEL: load_acquire_ptr: +; LA32: # %bb.0: +; LA32-NEXT:ld.w $a0, $a0, 0 +; LA32-NEXT:dbar 20 +; LA32-NEXT:ret +; +; LA64-LABEL: load_acquire_ptr: +; LA64: # %bb.0: +; LA64-NEXT:ld.d $a0, $a0, 0 +; LA64-NEXT:dbar 20 +; LA64-NEXT:ret + %val = load atomic ptr, ptr %ptr acquire, align 8 + ret ptr %val +} + define i8 @load_unordered_i8(ptr %ptr) { ; LA32-LABEL: load_unordered_i8: ; LA32: # %bb.0: @@ -135,6 +151,20 @@ define i64 @load_unordered_i64(ptr %ptr) { ret i64 %val } +define ptr @load_unordered_ptr(ptr %ptr) { +; LA32-LABEL: load_unordered_ptr: +; LA32: # %bb.0: +; LA32-NEXT:ld.w $a0, $a0, 0 +; LA32-NEXT:ret +; +; LA64-LABEL: load_unordered_ptr: +; LA64: # %bb.0: +; LA64-NEXT:ld.d $a0, $a0, 0 +; LA64-NEXT:ret + %val = load atomic ptr, ptr %ptr unordered, align 8 + ret ptr %val +} + define i8 @load_monotonic_i8(ptr %ptr) { ; LA32-LABEL: load_monotonic_i8: ; LA32: # %bb.0: @@ -198,6 +228,20 @@ define i64 @load_monotonic_i64(ptr %ptr) { ret i64 %val } +define ptr @load_monotonic_ptr(ptr %ptr) { +; LA32-LABEL: load_monotonic_ptr: +; LA32: # %bb.0: +; LA32-NEXT:ld.w $a0, $a0, 0 +; LA32-NEXT:ret +; +; LA64-LABEL: load_monotonic_ptr: +; LA64: # %bb.0: +; LA64-NEXT:ld.d $a0, $a0, 0 +; LA64-NEXT:ret + %val = load atomic ptr, ptr %ptr monotonic, align 8 + ret ptr %val +} + define i8 @load_seq_cst_i8(ptr %ptr) { ; LA32-LABEL: load_seq_cst_i8: ; LA32: # %bb.0: @@ -268,6 +312,22 @@ define i64 @load_seq_cst_i64(ptr %ptr) { ret i64 %val } +define ptr @load_seq_cst_ptr(ptr %ptr) { +; LA32-LABEL: load_seq_cst_ptr: +; LA32: # %bb.0: +; LA32-NEXT:ld.w $a0, $a0, 0 +; LA32-NEXT:dbar 16 +; LA32-NEXT:ret +; +; LA64-LABEL: load_seq_cst_ptr: +; LA64: # %bb.0: +; LA64-NEXT:ld.d $a0, $a0, 0 +; LA64-NEXT:dbar 16 +; LA64-NEXT:ret + %val = load atomic ptr, ptr %ptr seq_cst, align 8 + ret ptr %val +} + define void @store_release_i8(ptr %ptr, i8 signext %v) { 
; LA32-LABEL: store_release_i8: ; LA32: # %bb.0: @@ -336,6 +396,21 @@ define void @store_release_i64(ptr %ptr, i64 %v) { ret void } +define void @store_release_ptr(ptr %ptr, ptr %v) { +; LA32-LABEL: store_release_ptr: +; LA32: # %bb.0: +; LA32-NEXT:dbar 18 +; LA32-NEXT:st.w $a1, $a0, 0 +; LA32-NEXT:ret +; +; LA64-LABEL: store_release_ptr: +; LA64: # %bb.0: +; LA64-NEXT:amswap_db.d $zero, $a1, $a0 +; LA64-NEXT:ret + store atomic ptr %v, ptr %ptr release, align 8 + ret void +} + define void @store_unordered_i8(ptr %ptr, i8 signext %v) { ; LA32-LABEL: store_unordered_i8: ; LA32: # %bb.0: @@ -399,6 +474,20 @@ define void @store_unordered_i64(ptr %ptr, i64 %v) { ret void } +define void @store_unordered_ptr(ptr %ptr, ptr %v) { +; LA32-LABEL: store_unordered_ptr: +; LA32: # %bb.0: +; LA32-NEXT:st.w $a1, $a0, 0 +; LA32-NEXT:ret +; +; LA64-LABEL: store_unordered_ptr: +; LA64: # %bb.0: +; LA64-NEXT:st.d $a1, $a0, 0 +; LA64-NEXT:ret + store atomic ptr %v, ptr %ptr unordered, align 8 + ret void +} + define void @store_monotonic_i8(ptr %ptr, i8 signext %v) { ; LA32-LABEL: store_monotonic
[llvm-branch-commits] [llvm] b3734d9 - [LoongArch] Fix the assertion for atomic store with 'ptr' type
Author: Weining Lu Date: 2024-10-01T08:53:44+02:00 New Revision: b3734d9f93c1f8d908836a966f77c6792242df99 URL: https://github.com/llvm/llvm-project/commit/b3734d9f93c1f8d908836a966f77c6792242df99 DIFF: https://github.com/llvm/llvm-project/commit/b3734d9f93c1f8d908836a966f77c6792242df99.diff LOG: [LoongArch] Fix the assertion for atomic store with 'ptr' type (cherry picked from commit 63267ca9016aa334b329aa408716456b4e3799c8) Added: Modified: llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll Removed: diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 93edafaff553ba..082b42398c6a71 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5601,8 +5601,9 @@ bool LoongArchTargetLowering::shouldInsertFencesForAtomic( // On LA64, atomic store operations with IntegerBitWidth of 32 and 64 do not // require fences beacuse we can use amswap_db.[w/d]. - if (isa(I)) { -unsigned Size = I->getOperand(0)->getType()->getIntegerBitWidth(); + Type *Ty = I->getOperand(0)->getType(); + if (isa(I) && Ty->isIntegerTy()) { +unsigned Size = Ty->getIntegerBitWidth(); return (Size == 8 || Size == 16); } diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll index c51fded410e83b..1af2b38d799436 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll @@ -72,6 +72,22 @@ define i64 @load_acquire_i64(ptr %ptr) { ret i64 %val } +define ptr @load_acquire_ptr(ptr %ptr) { +; LA32-LABEL: load_acquire_ptr: +; LA32: # %bb.0: +; LA32-NEXT:ld.w $a0, $a0, 0 +; LA32-NEXT:dbar 20 +; LA32-NEXT:ret +; +; LA64-LABEL: load_acquire_ptr: +; LA64: # %bb.0: +; LA64-NEXT:ld.d $a0, $a0, 0 +; LA64-NEXT:dbar 20 +; LA64-NEXT:ret + %val = load atomic ptr, ptr %ptr acquire, align 8 + ret ptr %val +} + define i8 @load_unordered_i8(ptr %ptr) { ; LA32-LABEL: load_unordered_i8: ; LA32: # %bb.0: @@ -135,6 +151,20 @@ define i64 @load_unordered_i64(ptr %ptr) { ret i64 %val } +define ptr @load_unordered_ptr(ptr %ptr) { +; LA32-LABEL: load_unordered_ptr: +; LA32: # %bb.0: +; LA32-NEXT:ld.w $a0, $a0, 0 +; LA32-NEXT:ret +; +; LA64-LABEL: load_unordered_ptr: +; LA64: # %bb.0: +; LA64-NEXT:ld.d $a0, $a0, 0 +; LA64-NEXT:ret + %val = load atomic ptr, ptr %ptr unordered, align 8 + ret ptr %val +} + define i8 @load_monotonic_i8(ptr %ptr) { ; LA32-LABEL: load_monotonic_i8: ; LA32: # %bb.0: @@ -198,6 +228,20 @@ define i64 @load_monotonic_i64(ptr %ptr) { ret i64 %val } +define ptr @load_monotonic_ptr(ptr %ptr) { +; LA32-LABEL: load_monotonic_ptr: +; LA32: # %bb.0: +; LA32-NEXT:ld.w $a0, $a0, 0 +; LA32-NEXT:ret +; +; LA64-LABEL: load_monotonic_ptr: +; LA64: # %bb.0: +; LA64-NEXT:ld.d $a0, $a0, 0 +; LA64-NEXT:ret + %val = load atomic ptr, ptr %ptr monotonic, align 8 + ret ptr %val +} + define i8 @load_seq_cst_i8(ptr %ptr) { ; LA32-LABEL: load_seq_cst_i8: ; LA32: # %bb.0: @@ -268,6 +312,22 @@ define i64 @load_seq_cst_i64(ptr %ptr) { ret i64 %val } +define ptr @load_seq_cst_ptr(ptr %ptr) { +; LA32-LABEL: load_seq_cst_ptr: +; LA32: # %bb.0: +; LA32-NEXT:ld.w $a0, $a0, 0 +; LA32-NEXT:dbar 16 +; LA32-NEXT:ret +; +; LA64-LABEL: load_seq_cst_ptr: +; LA64: # %bb.0: +; LA64-NEXT:ld.d $a0, $a0, 0 +; LA64-NEXT:dbar 16 +; LA64-NEXT:ret + %val = load atomic ptr, ptr %ptr seq_cst, align 8 + ret ptr 
%val +} + define void @store_release_i8(ptr %ptr, i8 signext %v) { ; LA32-LABEL: store_release_i8: ; LA32: # %bb.0: @@ -336,6 +396,21 @@ define void @store_release_i64(ptr %ptr, i64 %v) { ret void } +define void @store_release_ptr(ptr %ptr, ptr %v) { +; LA32-LABEL: store_release_ptr: +; LA32: # %bb.0: +; LA32-NEXT:dbar 18 +; LA32-NEXT:st.w $a1, $a0, 0 +; LA32-NEXT:ret +; +; LA64-LABEL: store_release_ptr: +; LA64: # %bb.0: +; LA64-NEXT:amswap_db.d $zero, $a1, $a0 +; LA64-NEXT:ret + store atomic ptr %v, ptr %ptr release, align 8 + ret void +} + define void @store_unordered_i8(ptr %ptr, i8 signext %v) { ; LA32-LABEL: store_unordered_i8: ; LA32: # %bb.0: @@ -399,6 +474,20 @@ define void @store_unordered_i64(ptr %ptr, i64 %v) { ret void } +define void @store_unordered_ptr(ptr %ptr, ptr %v) { +; LA32-LABEL: store_unordered_ptr: +; LA32: # %bb.0: +; LA32-NEXT:st.w $a1, $a0, 0 +; LA32-NEXT:ret +; +; LA64-LABEL: store_unordered_ptr: +; LA64: # %bb.0: +; LA64-NEXT:st.d $a1, $a0, 0 +; LA64-NEXT:ret + store atomic pt
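The failure mode behind the fix above is worth spelling out: Type::getIntegerBitWidth() asserts unless the type really is an integer type, so querying it for a `store atomic ptr` fired the assertion. A minimal standalone sketch of the corrected guard (a simplification under stated assumptions, not the actual shouldInsertFencesForAtomic hook):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    // Returns true only for the small integer atomic stores that still need
    // explicit fences on LA64. Pointer-typed stores never reach
    // getIntegerBitWidth(), which may only be called on integer types.
    static bool isSmallIntegerAtomicStore(const Instruction *I) {
      if (const auto *SI = dyn_cast<StoreInst>(I)) {
        Type *Ty = SI->getValueOperand()->getType();
        if (Ty->isIntegerTy()) {
          unsigned Size = Ty->getIntegerBitWidth();
          return Size == 8 || Size == 16; // 32/64-bit use amswap_db.[w/d]
        }
      }
      return false;
    }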
[llvm-branch-commits] [clang] release/19.x: [clang-scan-deps] Don't inspect Args[0] as an option (#109050) (PR #109865)
github-actions[bot] wrote: @mstorsjo (or anyone else). If you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/109865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [LoongArch] Fix the assertion for atomic store with 'ptr' type (PR #109915)
github-actions[bot] wrote: @heiher (or anyone else). If you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/109915 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [LoongArch] Fix the assertion for atomic store with 'ptr' type (PR #109915)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/109915 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] 962edd3 - AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256)
Author: Petar Avramovic Date: 2024-10-01T08:56:50+02:00 New Revision: 962edd3f71eebdcd781222cdd97a561979894003 URL: https://github.com/llvm/llvm-project/commit/962edd3f71eebdcd781222cdd97a561979894003 DIFF: https://github.com/llvm/llvm-project/commit/962edd3f71eebdcd781222cdd97a561979894003.diff LOG: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) Use i32 for offset instead of i16, this way it does not get interpreted as negative 16 bit offset. (cherry picked from commit 83fe85115da9dc25fa270d2ea8140113c8d49670) Added: Modified: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp llvm/test/CodeGen/AMDGPU/flat-scratch.ll Removed: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b7471bab128509..7b786ee2641721 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1911,7 +1911,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32); return true; } @@ -1967,7 +1967,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; -Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); +Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); return true; } } @@ -2000,7 +2000,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; SAddr = SelectSAddrFI(CurDAG, SAddr); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index f040b47428640a..284f1746145225 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4956,7 +4956,7 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: sgpr_base_large_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:-24 +; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:65512 ; GFX12-NEXT:s_wait_loadcnt 0x0 ; GFX12-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-NEXT:s_nop 0 @@ -5015,7 +5015,7 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-PAL-LABEL: sgpr_base_large_offset: ; GFX12-PAL: ; %bb.0: ; %entry -; GFX12-PAL-NEXT:scratch_load_b32 v2, off, s0 offset:-24 +; GFX12-PAL-NEXT:scratch_load_b32 v2, off, s0 offset:65512 ; GFX12-PAL-NEXT:s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-PAL-NEXT:s_nop 0 @@ -5068,7 +5068,7 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT:v_mov_b32_e32 v2, 0x100 ; GFX12-NEXT:s_and_b32 s0, s0, -4 -; GFX12-NEXT:scratch_load_b32 v2, v2, s0 offset:-24 scope:SCOPE_SYS +; GFX12-NEXT:scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS ; GFX12-NEXT:s_wait_loadcnt 0x0 ; GFX12-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-NEXT:s_nop 0 @@ -5133,7 +5133,7 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; GFX12-PAL: ; %bb.0: ; %entry ; GFX12-PAL-NEXT:v_mov_b32_e32 v2, 0x100 ; GFX12-PAL-NEXT:s_and_b32 s0, s0, -4 -; 
GFX12-PAL-NEXT:scratch_load_b32 v2, v2, s0 offset:-24 scope:SCOPE_SYS +; GFX12-PAL-NEXT:scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS ; GFX12-PAL-NEXT:s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-PAL-NEXT:s_nop 0 @@ -5189,7 +5189,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT:v_mov_b32_e32 v1, 15 ; GFX12-NEXT:s_add_co_i32 s0, s0, s1 -; GFX12-NEXT:scratch_store_b32 v0, v1, s0 offset:-24 scope:SCOPE_SYS +; GFX12-NEXT:scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS ; GFX12-NEXT:s_wait_storecnt 0x0 ; GFX12-NEXT:s_endpgm ; @@ -5251,7 +5251,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT:v_mov_b32_e32 v1, 15 ; GFX12-PAL-NEXT:s_add_co_i32 s0, s0, s1 -; GFX12-PAL-NEXT:scratch_store_b32 v0, v1, s0 offset:-24 scope:SCOPE_SYS +; GFX12-PAL-NEXT:scratch_store_b32 v0, v1, s0 offset:65512
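The root cause is a plain reinterpretation of the same 16 bits: 0xFFE8 is 65512 as an unsigned field but -24 when read back as a signed i16 immediate, exactly the flip visible in the test updates above. A standalone sketch (not LLVM code) of the two readings:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t COffsetVal = 0xFFE8; // the offset from the test, 65512
      int16_t AsI16 = static_cast<int16_t>(COffsetVal); // -24, the bug
      int32_t AsI32 = static_cast<int32_t>(COffsetVal); // 65512, the intent
      std::printf("as i16: %d, as i32: %d\n", AsI16, AsI32);
      return 0;
    }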
[llvm-branch-commits] [llvm] 03d1337 - AMDGPU: Add test for 16 bit unsigned scratch offsets (#110255)
Author: Petar Avramovic Date: 2024-10-01T08:56:50+02:00 New Revision: 03d133728ae14704b262c55bbb72ecd9d048add5 URL: https://github.com/llvm/llvm-project/commit/03d133728ae14704b262c55bbb72ecd9d048add5 DIFF: https://github.com/llvm/llvm-project/commit/03d133728ae14704b262c55bbb72ecd9d048add5.diff LOG: AMDGPU: Add test for 16 bit unsigned scratch offsets (#110255) Large scratch offset with one on highest bit selected as negative, negative offset has same binary representation in 16 bits as large unsigned offset. (cherry picked from commit e9d12a6b451bd403d95105aa976a011dc821f126) Added: Modified: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll llvm/test/CodeGen/AMDGPU/flat-scratch.ll Removed: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index a5e4151bf36958..47ca6f416b02b0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1513,4 +1513,243 @@ bb: ret void } +define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT:s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT:s_add_u32 s0, s2, 0xffe8 +; GFX9-NEXT:scratch_load_dword v2, off, s0 +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:global_store_dword v[0:1], v2, off +; GFX9-NEXT:s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT:s_add_u32 s0, s0, s5 +; GFX10-NEXT:s_addc_u32 s1, s1, 0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT:s_add_u32 s0, s2, 0xffe8 +; GFX10-NEXT:scratch_load_dword v2, off, s0 +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:global_store_dword v[0:1], v2, off +; GFX10-NEXT:s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT:s_add_u32 s0, s0, 0xffe8 +; GFX940-NEXT:scratch_load_dword v2, off, s0 +; GFX940-NEXT:s_waitcnt vmcnt(0) +; GFX940-NEXT:global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT:s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT:s_add_u32 s0, s0, 0xffe8 +; GFX11-NEXT:scratch_load_b32 v2, off, s0 +; GFX11-NEXT:s_waitcnt vmcnt(0) +; GFX11-NEXT:global_store_b32 v[0:1], v2, off +; GFX11-NEXT:s_nop 0 +; GFX11-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT:s_endpgm +; +; GFX12-LABEL: sgpr_base_large_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:65512 +; GFX12-NEXT:s_wait_loadcnt 0x0 +; GFX12-NEXT:global_store_b32 v[0:1], v2, off +; GFX12-NEXT:s_nop 0 +; GFX12-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT:s_endpgm +entry: + %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512 + %load = load i32, ptr addrspace(5) %large_offset, align 4 + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset_split: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT:s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT:s_and_b32 s0, s2, -4 +; GFX9-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX9-NEXT:scratch_load_dword v2, off, s0 glc +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:global_store_dword v[0:1], v2, off +; GFX9-NEXT:s_endpgm +; +; GFX10-LABEL: 
sgpr_base_large_offset_split: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT:s_add_u32 s0, s0, s5 +; GFX10-NEXT:s_addc_u32 s1, s1, 0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT:s_and_b32 s0, s2, -4 +; GFX10-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX10-NEXT:scratch_load_dword v2, off, s0 glc dlc +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:global_store_dword v[0:1], v2, off +; GFX10-NEXT:s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset_split: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT:s_and_b32 s0, s0, -4 +; GFX940-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX940-NEXT:scratch_load_dword v2, off, s0 sc0 sc1 +; GFX940-NEXT:s_waitcnt vmcnt(0) +; GFX940-NEXT:global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT:s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset_split: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT:s_and_b32 s0, s0, -4 +; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX11-NEXT:s
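The new tests bracket the encoding boundary: 65512 (0xFFE8) still fits an unsigned 16-bit offset field and is encoded directly on GFX12, while 0x100FFE8 does not fit and is folded into the SGPR base with s_add_u32 first. A small sketch of that decision, assuming a simplified unsigned 16-bit field:

    #include <cstdint>

    // Assumption: the scratch immediate holds an unsigned 16-bit offset.
    constexpr bool fitsUnsignedScratchOffset16(uint32_t Off) {
      return Off <= 0xFFFFu;
    }
    static_assert(fitsUnsignedScratchOffset16(0xFFE8),
                  "65512 is encoded directly in the immediate field");
    static_assert(!fitsUnsignedScratchOffset16(0x100FFE8),
                  "larger offsets are materialized into the SGPR base");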
[llvm-branch-commits] [llvm] release/19.x: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) (PR #110470)
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/110470 >From 03d133728ae14704b262c55bbb72ecd9d048add5 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Mon, 30 Sep 2024 10:39:17 +0200 Subject: [PATCH 1/2] AMDGPU: Add test for 16 bit unsigned scratch offsets (#110255) Large scratch offset with one on highest bit selected as negative, negative offset has same binary representation in 16 bits as large unsigned offset. (cherry picked from commit e9d12a6b451bd403d95105aa976a011dc821f126) --- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 239 ++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 444 ++ 2 files changed, 683 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index a5e4151bf36958..47ca6f416b02b0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1513,4 +1513,243 @@ bb: ret void } +define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT:s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT:s_add_u32 s0, s2, 0xffe8 +; GFX9-NEXT:scratch_load_dword v2, off, s0 +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:global_store_dword v[0:1], v2, off +; GFX9-NEXT:s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT:s_add_u32 s0, s0, s5 +; GFX10-NEXT:s_addc_u32 s1, s1, 0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT:s_add_u32 s0, s2, 0xffe8 +; GFX10-NEXT:scratch_load_dword v2, off, s0 +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:global_store_dword v[0:1], v2, off +; GFX10-NEXT:s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT:s_add_u32 s0, s0, 0xffe8 +; GFX940-NEXT:scratch_load_dword v2, off, s0 +; GFX940-NEXT:s_waitcnt vmcnt(0) +; GFX940-NEXT:global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT:s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT:s_add_u32 s0, s0, 0xffe8 +; GFX11-NEXT:scratch_load_b32 v2, off, s0 +; GFX11-NEXT:s_waitcnt vmcnt(0) +; GFX11-NEXT:global_store_b32 v[0:1], v2, off +; GFX11-NEXT:s_nop 0 +; GFX11-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT:s_endpgm +; +; GFX12-LABEL: sgpr_base_large_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:65512 +; GFX12-NEXT:s_wait_loadcnt 0x0 +; GFX12-NEXT:global_store_b32 v[0:1], v2, off +; GFX12-NEXT:s_nop 0 +; GFX12-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT:s_endpgm +entry: + %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512 + %load = load i32, ptr addrspace(5) %large_offset, align 4 + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) { +; GFX9-LABEL: sgpr_base_large_offset_split: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT:s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT:s_and_b32 s0, s2, -4 +; GFX9-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX9-NEXT:scratch_load_dword v2, off, s0 glc +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:global_store_dword v[0:1], v2, off +; GFX9-NEXT:s_endpgm +; +; GFX10-LABEL: sgpr_base_large_offset_split: +; GFX10: ; %bb.0: ; 
%entry +; GFX10-NEXT:s_add_u32 s0, s0, s5 +; GFX10-NEXT:s_addc_u32 s1, s1, 0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT:s_and_b32 s0, s2, -4 +; GFX10-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX10-NEXT:scratch_load_dword v2, off, s0 glc dlc +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:global_store_dword v[0:1], v2, off +; GFX10-NEXT:s_endpgm +; +; GFX940-LABEL: sgpr_base_large_offset_split: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT:s_and_b32 s0, s0, -4 +; GFX940-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX940-NEXT:scratch_load_dword v2, off, s0 sc0 sc1 +; GFX940-NEXT:s_waitcnt vmcnt(0) +; GFX940-NEXT:global_store_dword v[0:1], v2, off sc0 sc1 +; GFX940-NEXT:s_endpgm +; +; GFX11-LABEL: sgpr_base_large_offset_split: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT:s_and_b32 s0, s0, -4 +; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT:s_add_u32 s0, s0, 0x100ffe8 +; GFX11-NEXT:scratch_load_b32 v2, off, s0 glc dlc +; GFX11-NEXT:s_waitcnt vmcnt(0) +; GFX11-NEXT:global_store_b32 v[0:1], v2,
[llvm-branch-commits] [libcxx] [release/19.x][libc++] Fix AppleClang version number when checking for __builtin_verbose_trap support (PR #110263)
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/110263 >From 53010fcf66b5a84153bce6b7e866edb596e59cf4 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 27 Sep 2024 08:53:02 -0400 Subject: [PATCH] [libc++] Fix AppleClang version number when checking for __builtin_verbose_trap support (#110161) We should have been checking against 1700, not 17000, which was a typo. (cherry picked from commit 1eba87904b0cbaaee82cfdb835528b85d99320ef) --- libcxx/vendor/llvm/default_assertion_handler.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libcxx/vendor/llvm/default_assertion_handler.in b/libcxx/vendor/llvm/default_assertion_handler.in index 3b6d6b2cca53c2..e12daff37f 100644 --- a/libcxx/vendor/llvm/default_assertion_handler.in +++ b/libcxx/vendor/llvm/default_assertion_handler.in @@ -26,7 +26,8 @@ # if __has_builtin(__builtin_verbose_trap) // AppleClang shipped a slightly different version of __builtin_verbose_trap from the upstream // version before upstream Clang actually got the builtin. -#if defined(_LIBCPP_APPLE_CLANG_VER) && _LIBCPP_APPLE_CLANG_VER < 17000 +// TODO: Remove once AppleClang supports the two-arguments version of the builtin. +#if defined(_LIBCPP_APPLE_CLANG_VER) && _LIBCPP_APPLE_CLANG_VER < 1700 # define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap(message) #else # define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap("libc++", message) ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [release/19.x][libc++] Fix AppleClang version number when checking for __builtin_verbose_trap support (PR #110263)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/110263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] 53010fc - [libc++] Fix AppleClang version number when checking for __builtin_verbose_trap support (#110161)
Author: Louis Dionne Date: 2024-10-01T08:56:18+02:00 New Revision: 53010fcf66b5a84153bce6b7e866edb596e59cf4 URL: https://github.com/llvm/llvm-project/commit/53010fcf66b5a84153bce6b7e866edb596e59cf4 DIFF: https://github.com/llvm/llvm-project/commit/53010fcf66b5a84153bce6b7e866edb596e59cf4.diff LOG: [libc++] Fix AppleClang version number when checking for __builtin_verbose_trap support (#110161) We should have been checking against 1700, not 17000, which was a typo. (cherry picked from commit 1eba87904b0cbaaee82cfdb835528b85d99320ef) Added: Modified: libcxx/vendor/llvm/default_assertion_handler.in Removed: diff --git a/libcxx/vendor/llvm/default_assertion_handler.in b/libcxx/vendor/llvm/default_assertion_handler.in index 3b6d6b2cca53c2..e12daff37f 100644 --- a/libcxx/vendor/llvm/default_assertion_handler.in +++ b/libcxx/vendor/llvm/default_assertion_handler.in @@ -26,7 +26,8 @@ # if __has_builtin(__builtin_verbose_trap) // AppleClang shipped a slightly different version of __builtin_verbose_trap from the upstream // version before upstream Clang actually got the builtin. -#if defined(_LIBCPP_APPLE_CLANG_VER) && _LIBCPP_APPLE_CLANG_VER < 17000 +// TODO: Remove once AppleClang supports the two-arguments version of the builtin. +#if defined(_LIBCPP_APPLE_CLANG_VER) && _LIBCPP_APPLE_CLANG_VER < 1700 # define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap(message) #else # define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap("libc++", message) ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
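The corrected threshold follows from how the version macro is built; a hedged sketch, paraphrasing libc++'s <__config> (treat the exact encoding as an assumption to verify there):

    // AppleClang XX.Y.Z is encoded as XXYZ, e.g. AppleClang 14.0.3 -> 1403,
    // so AppleClang 17 compares as 1700 and 17000 could never match.
    #if defined(__apple_build_version__)
    #  define _LIBCPP_APPLE_CLANG_VER (__apple_build_version__ / 10000)
    #endif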
[llvm-branch-commits] [llvm] release/19.x: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) (PR #110470)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/110470 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [loongarch][DAG][FREEZE] Fix crash when FREEZE a half(f16) type on loongarch (#107791) (PR #109093)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/109093 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [loongarch][DAG][FREEZE] Fix crash when FREEZE a half(f16) type on loongarch (#107791) (PR #109093)
tru wrote: Sounds to me like we don't have to accept this patch into the 19.1 release. Please re-open and argue if you don't agree with that assessment. https://github.com/llvm/llvm-project/pull/109093 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [release/19.x][libc++] Fix AppleClang version number when checking for __builtin_verbose_trap support (PR #110263)
github-actions[bot] wrote: @ldionne (or anyone else). If you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/110263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] Backport "[Clang][CodeGen] Fix type for atomic float incdec operators (#107075)" (PR #107184)
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/107184 >From 149bfdd61c961edbf49c2ea7fadf9d3c1a79a55e Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 4 Sep 2024 12:19:46 +0800 Subject: [PATCH] [Clang][CodeGen] Fix type for atomic float incdec operators (#107075) `llvm::ConstantFP::get(llvm::LLVMContext&, APFloat(float))` always returns a f32 constant. Fix https://github.com/llvm/llvm-project/issues/107054. --- clang/lib/CodeGen/CGExprScalar.cpp| 26 +- clang/test/CodeGen/X86/x86-atomic-double.c| 88 +++--- .../test/CodeGen/X86/x86-atomic-long_double.c | 293 ++ 3 files changed, 300 insertions(+), 107 deletions(-) diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index a17d68424bbce5..6e212e74676e8d 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2833,18 +2833,22 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, llvm::AtomicOrdering::SequentiallyConsistent); return isPre ? Builder.CreateBinOp(op, old, amt) : old; } -// Special case for atomic increment/decrement on floats +// Special case for atomic increment/decrement on floats. +// Bail out non-power-of-2-sized floating point types (e.g., x86_fp80). if (type->isFloatingType()) { - llvm::AtomicRMWInst::BinOp aop = - isInc ? llvm::AtomicRMWInst::FAdd : llvm::AtomicRMWInst::FSub; - llvm::Instruction::BinaryOps op = - isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub; - llvm::Value *amt = llvm::ConstantFP::get( - VMContext, llvm::APFloat(static_cast<float>(1.0))); - llvm::Value *old = - Builder.CreateAtomicRMW(aop, LV.getAddress(), amt, - llvm::AtomicOrdering::SequentiallyConsistent); - return isPre ? Builder.CreateBinOp(op, old, amt) : old; + llvm::Type *Ty = ConvertType(type); + if (llvm::has_single_bit(Ty->getScalarSizeInBits())) { +llvm::AtomicRMWInst::BinOp aop = +isInc ? llvm::AtomicRMWInst::FAdd : llvm::AtomicRMWInst::FSub; +llvm::Instruction::BinaryOps op = +isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub; +llvm::Value *amt = llvm::ConstantFP::get(Ty, 1.0); +llvm::AtomicRMWInst *old = Builder.CreateAtomicRMW( +aop, LV.getAddress(), amt, +llvm::AtomicOrdering::SequentiallyConsistent); + +return isPre ?
Builder.CreateBinOp(op, old, amt) : old; + } } value = EmitLoadOfLValue(LV, E->getExprLoc()); input = value; diff --git a/clang/test/CodeGen/X86/x86-atomic-double.c b/clang/test/CodeGen/X86/x86-atomic-double.c index 2354c89cc2b170..09c8f70c3db854 100644 --- a/clang/test/CodeGen/X86/x86-atomic-double.c +++ b/clang/test/CodeGen/X86/x86-atomic-double.c @@ -6,20 +6,14 @@ // X64-LABEL: define dso_local double @test_double_post_inc( // X64-SAME: ) #[[ATTR0:[0-9]+]] { // X64-NEXT: entry: -// X64-NEXT:[[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT:[[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.00e+00 seq_cst, align 8 -// X64-NEXT:store float [[TMP0]], ptr [[RETVAL]], align 8 -// X64-NEXT:[[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT:ret double [[TMP1]] +// X64-NEXT:[[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, double 1.00e+00 seq_cst, align 8 +// X64-NEXT:ret double [[TMP0]] // // X86-LABEL: define dso_local double @test_double_post_inc( // X86-SAME: ) #[[ATTR0:[0-9]+]] { // X86-NEXT: entry: -// X86-NEXT:[[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT:[[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.00e+00 seq_cst, align 8 -// X86-NEXT:store float [[TMP0]], ptr [[RETVAL]], align 4 -// X86-NEXT:[[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT:ret double [[TMP1]] +// X86-NEXT:[[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, double 1.00e+00 seq_cst, align 8 +// X86-NEXT:ret double [[TMP0]] // double test_double_post_inc() { @@ -30,20 +24,14 @@ double test_double_post_inc() // X64-LABEL: define dso_local double @test_double_post_dc( // X64-SAME: ) #[[ATTR0]] { // X64-NEXT: entry: -// X64-NEXT:[[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT:[[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.00e+00 seq_cst, align 8 -// X64-NEXT:store float [[TMP0]], ptr [[RETVAL]], align 8 -// X64-NEXT:[[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT:ret double [[TMP1]] +// X64-NEXT:[[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, double 1.00e+00 seq_cst, align 8 +// X64-NEXT:ret double [[TMP0]] // // X86-LABEL: define dso_local double @test_double_post_dc( // X86-SAME: ) #[[ATTR0]] { // X86-NEXT: entry: -// X86-NEXT:[[RETVAL:%.*]]
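The one-line mistake is easier to see outside the diff: ConstantFP::get(LLVMContext &, const APFloat &) takes its type from the APFloat, and APFloat(float) is always IEEE single precision, while ConstantFP::get(Type *, double) follows the requested type. A hedged sketch of the two patterns (simplified, not the actual CGExprScalar.cpp code):

    #include "llvm/ADT/APFloat.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    Constant *makeFPOne(LLVMContext &Ctx, Type *Ty) {
      // Buggy pattern: an f32 constant regardless of Ty, so an atomicrmw on
      // double received a float operand.
      Constant *WrongOne = ConstantFP::get(Ctx, APFloat(static_cast<float>(1.0)));
      (void)WrongOne;
      // Fixed pattern: the constant's type matches the operand's type.
      return ConstantFP::get(Ty, 1.0);
    }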
[llvm-branch-commits] [libcxx] [release/19.x] Cherry-picks to fix the zdump spurious failures in CI (PR #110259)
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/110259 >From be879942edbb27b4a43eb6b5f4162153972fc41a Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 15 Aug 2024 08:14:13 + Subject: [PATCH 1/2] [lldb][test] Mark sys_info zdump test unsupported on 32 bit Arm Linux Until https://github.com/llvm/llvm-project/pull/103056 lands or another more appropriate check can be found. This test fails on Ubuntu Focal where zdump is built with 32 bit time_t but passes on Ubuntu Jammy where zdump is built with 64 bit time_t. Marking it unsupported means Linaro can upgrade its bots to Ubuntu Jammy without getting an unexpected pass. (cherry picked from commit 6f6422f4a2b8647a59936c131e50a79906d89510) --- .../time.zone.members/sys_info.zdump.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp index 207f8e4df45413..2b97d9a5bc745b 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp @@ -14,7 +14,7 @@ // XFAIL: availability-tzdb-missing // TODO TZDB Investigate -// XFAIL: target={{armv(7|8)l-linux-gnueabihf}} +// UNSUPPORTED: target={{armv(7|8)l-linux-gnueabihf}} #include #include >From b43662ee7cdcf96d9eeda3c5d6707c6fb08ed3dc Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 13 Sep 2024 09:14:53 +0100 Subject: [PATCH 2/2] [libcxx][test] Use smaller time range for 32 bit time_t (#104762) This fixes the test on Arm 32 bit Ubuntu Jammy where time_t is 32 bit. (cherry picked from commit cdd608b8f0ce090b3568238387df368751bdbb5d) --- .../time.zone.members/sys_info.zdump.pass.cpp| 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp index 2b97d9a5bc745b..b474fe50083b1d 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp @@ -13,9 +13,6 @@ // XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing -// TODO TZDB Investigate -// UNSUPPORTED: target={{armv(7|8)l-linux-gnueabihf}} - #include #include #include @@ -28,7 +25,7 @@ // The year range to validate. The dates used in practice are expected to be // inside the tested range. constexpr std::chrono::year first{1800}; -constexpr std::chrono::year last{2100}; +constexpr std::chrono::year last{sizeof(time_t) == 8 ? 2100 : 2037}; // A custom sys_info class that also stores the name of the time zone. // Its formatter matches the output of zdump. ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
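The 2037 cap in the second patch is the usual Y2038 guard: a signed 32-bit time_t runs out in January 2038, so a zdump comparison through 2100 is only meaningful when time_t is 64-bit. A standalone sketch (not part of the patch) of the limit:

    #include <cstdint>
    #include <cstdio>
    #include <ctime>

    int main() {
      // The last second representable by a signed 32-bit time_t.
      std::time_t Max32 = INT32_MAX;
      // Prints a timestamp in January 2038 (rendered in the local time zone).
      std::printf("%s", std::ctime(&Max32));
      return 0;
    }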
[llvm-branch-commits] [libcxx] [release/19.x][libc++] Disable the clang-tidy checks to get CI back (#109989) (PR #110162)
github-actions[bot] wrote: @ldionne (or anyone else). If you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/110162 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [release/19.x][libc++] Disable the clang-tidy checks to get CI back (#109989) (PR #110162)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/110162 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] Backport "[Clang][CodeGen] Fix type for atomic float incdec operators (#107075)" (PR #107184)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/107184 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/19.x: [clang-scan-deps] Don't inspect Args[0] as an option (#109050) (PR #109865)
Martin =?utf-8?q?Storsjö?= , Martin =?utf-8?q?Storsjö?= , Martin =?utf-8?q?Storsjö?= Message-ID: In-Reply-To: https://github.com/tru updated https://github.com/llvm/llvm-project/pull/109865 >From 7d1f2065d68795b6fc6de4953f9f0ac719cf1c65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 12 Sep 2024 22:20:14 +0300 Subject: [PATCH 1/4] [clang-scan-deps] Infer the target from the executable name (#108189) This allows clang-scan-deps to work correctly when using cross compilers with names like -clang. (cherry picked from commit 87e1104cf0e2de0d04bee2944893fa7897277b2f) --- clang/test/ClangScanDeps/implicit-target.c| 31 +++ clang/tools/clang-scan-deps/ClangScanDeps.cpp | 5 +++ 2 files changed, 36 insertions(+) create mode 100644 clang/test/ClangScanDeps/implicit-target.c diff --git a/clang/test/ClangScanDeps/implicit-target.c b/clang/test/ClangScanDeps/implicit-target.c new file mode 100644 index 00..cf757f937331a6 --- /dev/null +++ b/clang/test/ClangScanDeps/implicit-target.c @@ -0,0 +1,31 @@ +// Check that we can detect an implicit target when clang is invoked as +// clang. Using an implicit triple requires that the target actually +// is available, too. +// REQUIRES: x86-registered-target + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.in > %t/cdb.json + +// Check that we can deduce this both when using a compilation database, and when using +// a literal command line. + +// RUN: clang-scan-deps -format experimental-full -compilation-database %t/cdb.json | FileCheck %s + +// RUN: clang-scan-deps -format experimental-full -- x86_64-w64-mingw32-clang %t/source.c -o %t/source.o | FileCheck %s + +// CHECK: "-triple", +// CHECK-NEXT: "x86_64-w64-windows-gnu", + + +//--- cdb.json.in +[ + { +"directory": "DIR" +"command": "x86_64-w64-mingw32-clang -c DIR/source.c -o DIR/source.o" +"file": "DIR/source.c" + }, +] + +//--- source.c +void func(void) {} diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index a8f6150dd3493d..cd6dd2620152a6 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -15,6 +15,7 @@ #include "clang/Tooling/DependencyScanning/DependencyScanningTool.h" #include "clang/Tooling/DependencyScanning/DependencyScanningWorker.h" #include "clang/Tooling/JSONCompilationDatabase.h" +#include "clang/Tooling/Tooling.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/CommandLine.h" @@ -24,6 +25,7 @@ #include "llvm/Support/LLVMDriver.h" #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" +#include "llvm/Support/TargetSelect.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Support/Threading.h" #include "llvm/Support/Timer.h" @@ -795,6 +797,7 @@ getCompilationDatabase(int argc, char **argv, std::string &ErrorMessage) { } int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { + llvm::InitializeAllTargetInfos(); std::string ErrorMessage; std::unique_ptr Compilations = getCompilationDatabase(argc, argv, ErrorMessage); @@ -810,6 +813,8 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { Compilations = expandResponseFiles(std::move(Compilations), llvm::vfs::getRealFileSystem()); + Compilations = inferTargetAndDriverMode(std::move(Compilations)); + // The command options are rewritten to run Clang in preprocessor only mode. 
auto AdjustingCompilations = std::make_unique( >From a0fc8a2b2b85a70c8c523ff2d1fe4ef2e86cda7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 12 Sep 2024 23:11:27 +0300 Subject: [PATCH 2/4] [clang-scan-deps] Fix builds with BUILD_SHARED_LIBS=ON This fixes building in this configuration after 87e1104cf0e2de0d04bee2944893fa7897277b2f. (cherry picked from commit aa3465793a250faa5426ac626989375465256658) --- clang/tools/clang-scan-deps/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/tools/clang-scan-deps/CMakeLists.txt b/clang/tools/clang-scan-deps/CMakeLists.txt index f0be6a546ff882..10bc0ff23c5482 100644 --- a/clang/tools/clang-scan-deps/CMakeLists.txt +++ b/clang/tools/clang-scan-deps/CMakeLists.txt @@ -1,4 +1,5 @@ set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} Core Option Support >From 2b6c23303f7c3f6397003cdac4be6e9e6b78d957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 13 Sep 2024 23:18:10 +0300 Subject: [PATCH 3/4] [clang-scan-deps] Infer the tool locations from PATH (#108539) This allows the clang driver to know which tool is meant to be executed, which allows the clang driver to load the right clang config files, and allows clang to find colocated sysroots. This makes sure that doing `clang-scan-deps -- ...` looks up things in the same way as if one
[llvm-branch-commits] [clang] 7d1f206 - [clang-scan-deps] Infer the target from the executable name (#108189)
Author: Martin Storsjö Date: 2024-10-01T08:53:03+02:00 New Revision: 7d1f2065d68795b6fc6de4953f9f0ac719cf1c65 URL: https://github.com/llvm/llvm-project/commit/7d1f2065d68795b6fc6de4953f9f0ac719cf1c65 DIFF: https://github.com/llvm/llvm-project/commit/7d1f2065d68795b6fc6de4953f9f0ac719cf1c65.diff LOG: [clang-scan-deps] Infer the target from the executable name (#108189) This allows clang-scan-deps to work correctly when using cross compilers with names like <triple>-clang. (cherry picked from commit 87e1104cf0e2de0d04bee2944893fa7897277b2f) Added: clang/test/ClangScanDeps/implicit-target.c Modified: clang/tools/clang-scan-deps/ClangScanDeps.cpp Removed: diff --git a/clang/test/ClangScanDeps/implicit-target.c b/clang/test/ClangScanDeps/implicit-target.c new file mode 100644 index 00..cf757f937331a6 --- /dev/null +++ b/clang/test/ClangScanDeps/implicit-target.c @@ -0,0 +1,31 @@ +// Check that we can detect an implicit target when clang is invoked as +// <triple>-clang. Using an implicit triple requires that the target actually +// is available, too. +// REQUIRES: x86-registered-target + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.in > %t/cdb.json + +// Check that we can deduce this both when using a compilation database, and when using +// a literal command line. + +// RUN: clang-scan-deps -format experimental-full -compilation-database %t/cdb.json | FileCheck %s + +// RUN: clang-scan-deps -format experimental-full -- x86_64-w64-mingw32-clang %t/source.c -o %t/source.o | FileCheck %s + +// CHECK: "-triple", +// CHECK-NEXT: "x86_64-w64-windows-gnu", + + +//--- cdb.json.in +[ + { +"directory": "DIR" +"command": "x86_64-w64-mingw32-clang -c DIR/source.c -o DIR/source.o" +"file": "DIR/source.c" + }, +] + +//--- source.c +void func(void) {} diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index a8f6150dd3493d..cd6dd2620152a6 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -15,6 +15,7 @@ #include "clang/Tooling/DependencyScanning/DependencyScanningTool.h" #include "clang/Tooling/DependencyScanning/DependencyScanningWorker.h" #include "clang/Tooling/JSONCompilationDatabase.h" +#include "clang/Tooling/Tooling.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/CommandLine.h" @@ -24,6 +25,7 @@ #include "llvm/Support/LLVMDriver.h" #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" +#include "llvm/Support/TargetSelect.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Support/Threading.h" #include "llvm/Support/Timer.h" @@ -795,6 +797,7 @@ getCompilationDatabase(int argc, char **argv, std::string &ErrorMessage) { } int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { + llvm::InitializeAllTargetInfos(); std::string ErrorMessage; std::unique_ptr<tooling::CompilationDatabase> Compilations = getCompilationDatabase(argc, argv, ErrorMessage); @@ -810,6 +813,8 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { Compilations = expandResponseFiles(std::move(Compilations), llvm::vfs::getRealFileSystem()); + Compilations = inferTargetAndDriverMode(std::move(Compilations)); + // The command options are rewritten to run Clang in preprocessor only mode. auto AdjustingCompilations = std::make_unique<ArgumentsAdjustingCompilations>( ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
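Conceptually, the change makes the scanner treat argv[0] the way the driver itself does. A deliberately simplified, hypothetical sketch of the idea; the real inference is clang's driver-name parsing, which also understands clang++, clang-cl, version suffixes, and more:

    #include <string>

    struct InferredTool {
      std::string Triple; // empty means "use the host default"
      std::string Driver;
    };

    // Hypothetical helper: split "x86_64-w64-mingw32-clang" into a target
    // prefix and a driver name, mirroring what inferTargetAndDriverMode()
    // arranges through the real driver logic.
    InferredTool inferFromArgv0(const std::string &Name) {
      std::string::size_type Pos = Name.rfind('-');
      if (Pos == std::string::npos)
        return {"", Name}; // plain "clang": keep the host target
      return {Name.substr(0, Pos), Name.substr(Pos + 1)};
    }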
[llvm-branch-commits] [clang] a0fc8a2 - [clang-scan-deps] Fix builds with BUILD_SHARED_LIBS=ON
Author: Martin Storsjö Date: 2024-10-01T08:53:03+02:00 New Revision: a0fc8a2b2b85a70c8c523ff2d1fe4ef2e86cda7f URL: https://github.com/llvm/llvm-project/commit/a0fc8a2b2b85a70c8c523ff2d1fe4ef2e86cda7f DIFF: https://github.com/llvm/llvm-project/commit/a0fc8a2b2b85a70c8c523ff2d1fe4ef2e86cda7f.diff LOG: [clang-scan-deps] Fix builds with BUILD_SHARED_LIBS=ON This fixes building in this configuration after 87e1104cf0e2de0d04bee2944893fa7897277b2f. (cherry picked from commit aa3465793a250faa5426ac626989375465256658) Added: Modified: clang/tools/clang-scan-deps/CMakeLists.txt Removed: diff --git a/clang/tools/clang-scan-deps/CMakeLists.txt b/clang/tools/clang-scan-deps/CMakeLists.txt index f0be6a546ff882..10bc0ff23c5482 100644 --- a/clang/tools/clang-scan-deps/CMakeLists.txt +++ b/clang/tools/clang-scan-deps/CMakeLists.txt @@ -1,4 +1,5 @@ set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} Core Option Support ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] 2b6c233 - [clang-scan-deps] Infer the tool locations from PATH (#108539)
Author: Martin Storsjö Date: 2024-10-01T08:53:03+02:00 New Revision: 2b6c23303f7c3f6397003cdac4be6e9e6b78d957 URL: https://github.com/llvm/llvm-project/commit/2b6c23303f7c3f6397003cdac4be6e9e6b78d957 DIFF: https://github.com/llvm/llvm-project/commit/2b6c23303f7c3f6397003cdac4be6e9e6b78d957.diff LOG: [clang-scan-deps] Infer the tool locations from PATH (#108539) This allows the clang driver to know which tool is meant to be executed, which allows the clang driver to load the right clang config files, and allows clang to find colocated sysroots. This makes sure that doing `clang-scan-deps -- <tool> ...` looks up things in the same way as if one just would execute `<tool> ...`, when `<tool>` isn't an absolute or relative path. (cherry picked from commit a26ec542371652e1d774696e90016fd5b0b1c191) Added: clang/lib/Tooling/LocateToolCompilationDatabase.cpp clang/test/ClangScanDeps/resolve-executable-path.c Modified: clang/include/clang/Tooling/CompilationDatabase.h clang/lib/Tooling/CMakeLists.txt clang/test/ClangScanDeps/modules-extern-submodule.c clang/test/ClangScanDeps/modules-full-output-tu-order.c clang/test/ClangScanDeps/modules-has-include-umbrella-header.c clang/test/ClangScanDeps/modules-header-sharing.m clang/test/ClangScanDeps/modules-implementation-module-map.c clang/test/ClangScanDeps/modules-implementation-private.m clang/test/ClangScanDeps/modules-priv-fw-from-pub.m clang/tools/clang-scan-deps/ClangScanDeps.cpp Removed: diff --git a/clang/include/clang/Tooling/CompilationDatabase.h b/clang/include/clang/Tooling/CompilationDatabase.h index fee584acb48623..36fe0812ebe974 100644 --- a/clang/include/clang/Tooling/CompilationDatabase.h +++ b/clang/include/clang/Tooling/CompilationDatabase.h @@ -234,6 +234,12 @@ std::unique_ptr<CompilationDatabase> std::unique_ptr<CompilationDatabase> inferTargetAndDriverMode(std::unique_ptr<CompilationDatabase> Base); +/// Returns a wrapped CompilationDatabase that will transform argv[0] to an +/// absolute path, if it currently is a plain tool name, looking it up in +/// PATH. +std::unique_ptr<CompilationDatabase> +inferToolLocation(std::unique_ptr<CompilationDatabase> Base); + /// Returns a wrapped CompilationDatabase that will expand all rsp(response) /// files on commandline returned by underlying database. std::unique_ptr<CompilationDatabase> diff --git a/clang/lib/Tooling/CMakeLists.txt b/clang/lib/Tooling/CMakeLists.txt index 93a9e707a134cf..fc1f1f9f9d367e 100644 --- a/clang/lib/Tooling/CMakeLists.txt +++ b/clang/lib/Tooling/CMakeLists.txt @@ -25,6 +25,7 @@ add_clang_library(clangTooling GuessTargetAndModeCompilationDatabase.cpp InterpolatingCompilationDatabase.cpp JSONCompilationDatabase.cpp + LocateToolCompilationDatabase.cpp Refactoring.cpp RefactoringCallbacks.cpp StandaloneExecution.cpp diff --git a/clang/lib/Tooling/LocateToolCompilationDatabase.cpp b/clang/lib/Tooling/LocateToolCompilationDatabase.cpp new file mode 100644 index 00..033f69f3760c6d --- /dev/null +++ b/clang/lib/Tooling/LocateToolCompilationDatabase.cpp @@ -0,0 +1,71 @@ +//===- GuessTargetAndModeCompilationDatabase.cpp --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#include "clang/Tooling/CompilationDatabase.h" +#include "clang/Tooling/Tooling.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include <memory> + +namespace clang { +namespace tooling { + +namespace { +class LocationAdderDatabase : public CompilationDatabase { +public: + LocationAdderDatabase(std::unique_ptr<CompilationDatabase> Base) + : Base(std::move(Base)) { +assert(this->Base != nullptr); + } + + std::vector<std::string> getAllFiles() const override { +return Base->getAllFiles(); + } + + std::vector<CompileCommand> getAllCompileCommands() const override { +return addLocation(Base->getAllCompileCommands()); + } + + std::vector<CompileCommand> + getCompileCommands(StringRef FilePath) const override { +return addLocation(Base->getCompileCommands(FilePath)); + } + +private: + std::vector<CompileCommand> + addLocation(std::vector<CompileCommand> Cmds) const { +for (auto &Cmd : Cmds) { + if (Cmd.CommandLine.empty()) +continue; + std::string &Driver = Cmd.CommandLine.front(); + // If the driver name already is absolute, we don't need to do anything. + if (llvm::sys::path::is_absolute(Driver)) +continue; + // If the name is a relative path, like bin/clang, we assume it's + // possible to resolve it and don't do anything about it either. + if (llvm::any_of(Driver, + [](char C) { return llvm::sys::path::is_separator(C); })) +continue; + auto Absolute = llvm::sys::findProgramByName(Driver); + // If we found it in path, update the entry in Cmd.CommandLine
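The heart of the wrapper is the PATH lookup at the end of the diff above. A self-contained sketch of that resolution step under the same rules the patch applies (absolute and relative paths are left alone; only bare tool names are searched):

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Path.h"
    #include "llvm/Support/Program.h"
    #include <string>

    std::string resolveDriver(llvm::StringRef Driver) {
      // Absolute paths need no work.
      if (llvm::sys::path::is_absolute(Driver))
        return Driver.str();
      // Relative paths such as bin/clang are assumed to be resolvable as-is.
      if (llvm::any_of(Driver,
                       [](char C) { return llvm::sys::path::is_separator(C); }))
        return Driver.str();
      // A bare name is searched in PATH, as the wrapper does.
      if (auto Absolute = llvm::sys::findProgramByName(Driver))
        return *Absolute;
      return Driver.str(); // not found: leave it for the driver to diagnose
    }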
[llvm-branch-commits] [llvm] release/19.x: [LoopPeel] Fix LCSSA phi node invalidation (PR #109624)
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/109624 >From aaa7027716ad347cda75865e99a2ff654bed6bf1 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 20 Sep 2024 16:57:46 +0200 Subject: [PATCH] [LoopPeel] Fix LCSSA phi node invalidation In the test case, the BECount of the second loop uses %load, but we only have an LCSSA phi node for %add, so that is what gets invalidated. Use the forgetLcssaPhiWithNewPredecessor() API instead, which will invalidate the roots of the expression instead. Fixes https://github.com/llvm/llvm-project/issues/109333. (cherry picked from commit 5bcc82d43388bb0daa122d5fe7ecda5eca27fc16) --- llvm/lib/Transforms/Utils/LoopPeel.cpp | 2 +- llvm/test/Transforms/LoopUnroll/pr109333.ll | 104 2 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoopUnroll/pr109333.ll diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 5d7c0d947facc4..760f1619e030c3 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -859,7 +859,7 @@ static void cloneLoopBlocks( if (LatchInst && L->contains(LatchInst)) LatchVal = VMap[LatchVal]; PHI.addIncoming(LatchVal, cast<BasicBlock>(VMap[Edge.first])); - SE.forgetValue(&PHI); + SE.forgetLcssaPhiWithNewPredecessor(L, &PHI); } // LastValueMap is updated with the values for the current loop diff --git a/llvm/test/Transforms/LoopUnroll/pr109333.ll b/llvm/test/Transforms/LoopUnroll/pr109333.ll new file mode 100644 index 00..f7ac911a78207a --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/pr109333.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes="print<scalar-evolution>,loop-unroll" -unroll-runtime < %s 2>/dev/null | FileCheck %s + +; Make sure we use %add.lcssa rather than %load when expanding the +; backedge taken count.
+ +define void @test(i1 %c, ptr %p) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT:br label %[[LOOP_1_PEEL_BEGIN:.*]] +; CHECK: [[LOOP_1_PEEL_BEGIN]]: +; CHECK-NEXT:br label %[[LOOP_1_PEEL:.*]] +; CHECK: [[LOOP_1_PEEL]]: +; CHECK-NEXT:[[LOAD_PEEL:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT:[[ADD_PEEL:%.*]] = add i64 [[LOAD_PEEL]], 1 +; CHECK-NEXT:br i1 [[C]], label %[[IF:.*]], label %[[LOOP_1_PEEL_NEXT:.*]] +; CHECK: [[LOOP_1_PEEL_NEXT]]: +; CHECK-NEXT:br label %[[LOOP_1_PEEL_NEXT1:.*]] +; CHECK: [[LOOP_1_PEEL_NEXT1]]: +; CHECK-NEXT:br label %[[ENTRY_PEEL_NEWPH:.*]] +; CHECK: [[ENTRY_PEEL_NEWPH]]: +; CHECK-NEXT:br label %[[LOOP_1:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT:[[LOAD:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT:[[ADD:%.*]] = add i64 [[LOAD]], 1 +; CHECK-NEXT:br i1 [[C]], label %[[IF_LOOPEXIT:.*]], label %[[LOOP_1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[IF_LOOPEXIT]]: +; CHECK-NEXT:[[ADD_LCSSA_PH:%.*]] = phi i64 [ [[ADD]], %[[LOOP_1]] ] +; CHECK-NEXT:br label %[[IF]] +; CHECK: [[IF]]: +; CHECK-NEXT:[[ADD_LCSSA:%.*]] = phi i64 [ [[ADD_PEEL]], %[[LOOP_1_PEEL]] ], [ [[ADD_LCSSA_PH]], %[[IF_LOOPEXIT]] ] +; CHECK-NEXT:[[GEP:%.*]] = getelementptr i64, ptr [[P]], i64 [[ADD_LCSSA]] +; CHECK-NEXT:[[TMP0:%.*]] = shl i64 [[ADD_LCSSA]], 3 +; CHECK-NEXT:[[TMP1:%.*]] = lshr i64 [[TMP0]], 3 +; CHECK-NEXT:[[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT:[[XTRAITER:%.*]] = and i64 [[TMP2]], 7 +; CHECK-NEXT:[[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT:br i1 [[LCMP_MOD]], label %[[LOOP_2_PROL_PREHEADER:.*]], label %[[LOOP_2_PROL_LOOPEXIT:.*]] +; CHECK: [[LOOP_2_PROL_PREHEADER]]: +; CHECK-NEXT:br label %[[LOOP_2_PROL:.*]] +; CHECK: [[LOOP_2_PROL]]: +; CHECK-NEXT:[[IV_PROL:%.*]] = phi ptr [ [[P]], %[[LOOP_2_PROL_PREHEADER]] ], [ [[IV_NEXT_PROL:%.*]], %[[LOOP_2_PROL]] ] +; CHECK-NEXT:[[PROL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_2_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], %[[LOOP_2_PROL]] ] +; CHECK-NEXT:[[IV_NEXT_PROL]] = getelementptr i8, ptr [[IV_PROL]], i64 8 +; CHECK-NEXT:[[ICMP_PROL:%.*]] = icmp eq ptr [[IV_PROL]], [[GEP]] +; CHECK-NEXT:[[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 +; CHECK-NEXT:[[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT:br i1 [[PROL_ITER_CMP]], label %[[LOOP_2_PROL]], label %[[LOOP_2_PROL_LOOPEXIT_UNR_LCSSA:.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[LOOP_2_PROL_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT:[[IV_UNR_PH:%.*]] = phi ptr [ [[IV_NEXT_PROL]], %[[LOOP_2_PROL]] ] +; CHECK-NEXT:br label %[[LOOP_2_PROL_LOOPEXIT]] +; CHECK: [[LOOP_2_PROL_LOOPEXIT]]: +; CHECK-NEXT:[[IV_UNR:%.*]] = phi ptr [ [[P]], %[[IF]] ], [ [[IV_UNR_PH]], %[[LOOP_2_PROL_LOOPEXIT_UNR_LCSSA]] ] +;
[llvm-branch-commits] [llvm] aaa7027 - [LoopPeel] Fix LCSSA phi node invalidation
Author: Nikita Popov Date: 2024-10-01T08:52:08+02:00 New Revision: aaa7027716ad347cda75865e99a2ff654bed6bf1 URL: https://github.com/llvm/llvm-project/commit/aaa7027716ad347cda75865e99a2ff654bed6bf1 DIFF: https://github.com/llvm/llvm-project/commit/aaa7027716ad347cda75865e99a2ff654bed6bf1.diff LOG: [LoopPeel] Fix LCSSA phi node invalidation In the test case, the BECount of the second loop uses %load, but we only have an LCSSA phi node for %add, so that is what gets invalidated. Use the forgetLcssaPhiWithNewPredecessor() API instead, which will invalidate the roots of the expression instead. Fixes https://github.com/llvm/llvm-project/issues/109333. (cherry picked from commit 5bcc82d43388bb0daa122d5fe7ecda5eca27fc16) Added: llvm/test/Transforms/LoopUnroll/pr109333.ll Modified: llvm/lib/Transforms/Utils/LoopPeel.cpp Removed: diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 5d7c0d947facc4..760f1619e030c3 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -859,7 +859,7 @@ static void cloneLoopBlocks( if (LatchInst && L->contains(LatchInst)) LatchVal = VMap[LatchVal]; PHI.addIncoming(LatchVal, cast<BasicBlock>(VMap[Edge.first])); - SE.forgetValue(&PHI); + SE.forgetLcssaPhiWithNewPredecessor(L, &PHI); } // LastValueMap is updated with the values for the current loop diff --git a/llvm/test/Transforms/LoopUnroll/pr109333.ll b/llvm/test/Transforms/LoopUnroll/pr109333.ll new file mode 100644 index 00..f7ac911a78207a --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/pr109333.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes="print<scalar-evolution>,loop-unroll" -unroll-runtime < %s 2>/dev/null | FileCheck %s + +; Make sure we use %add.lcssa rather than %load when expanding the +; backedge taken count.
+ +define void @test(i1 %c, ptr %p) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT:br label %[[LOOP_1_PEEL_BEGIN:.*]] +; CHECK: [[LOOP_1_PEEL_BEGIN]]: +; CHECK-NEXT:br label %[[LOOP_1_PEEL:.*]] +; CHECK: [[LOOP_1_PEEL]]: +; CHECK-NEXT:[[LOAD_PEEL:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT:[[ADD_PEEL:%.*]] = add i64 [[LOAD_PEEL]], 1 +; CHECK-NEXT:br i1 [[C]], label %[[IF:.*]], label %[[LOOP_1_PEEL_NEXT:.*]] +; CHECK: [[LOOP_1_PEEL_NEXT]]: +; CHECK-NEXT:br label %[[LOOP_1_PEEL_NEXT1:.*]] +; CHECK: [[LOOP_1_PEEL_NEXT1]]: +; CHECK-NEXT:br label %[[ENTRY_PEEL_NEWPH:.*]] +; CHECK: [[ENTRY_PEEL_NEWPH]]: +; CHECK-NEXT:br label %[[LOOP_1:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT:[[LOAD:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT:[[ADD:%.*]] = add i64 [[LOAD]], 1 +; CHECK-NEXT:br i1 [[C]], label %[[IF_LOOPEXIT:.*]], label %[[LOOP_1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[IF_LOOPEXIT]]: +; CHECK-NEXT:[[ADD_LCSSA_PH:%.*]] = phi i64 [ [[ADD]], %[[LOOP_1]] ] +; CHECK-NEXT:br label %[[IF]] +; CHECK: [[IF]]: +; CHECK-NEXT:[[ADD_LCSSA:%.*]] = phi i64 [ [[ADD_PEEL]], %[[LOOP_1_PEEL]] ], [ [[ADD_LCSSA_PH]], %[[IF_LOOPEXIT]] ] +; CHECK-NEXT:[[GEP:%.*]] = getelementptr i64, ptr [[P]], i64 [[ADD_LCSSA]] +; CHECK-NEXT:[[TMP0:%.*]] = shl i64 [[ADD_LCSSA]], 3 +; CHECK-NEXT:[[TMP1:%.*]] = lshr i64 [[TMP0]], 3 +; CHECK-NEXT:[[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT:[[XTRAITER:%.*]] = and i64 [[TMP2]], 7 +; CHECK-NEXT:[[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT:br i1 [[LCMP_MOD]], label %[[LOOP_2_PROL_PREHEADER:.*]], label %[[LOOP_2_PROL_LOOPEXIT:.*]] +; CHECK: [[LOOP_2_PROL_PREHEADER]]: +; CHECK-NEXT:br label %[[LOOP_2_PROL:.*]] +; CHECK: [[LOOP_2_PROL]]: +; CHECK-NEXT:[[IV_PROL:%.*]] = phi ptr [ [[P]], %[[LOOP_2_PROL_PREHEADER]] ], [ [[IV_NEXT_PROL:%.*]], %[[LOOP_2_PROL]] ] +; CHECK-NEXT:[[PROL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_2_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], %[[LOOP_2_PROL]] ] +; CHECK-NEXT:[[IV_NEXT_PROL]] = getelementptr i8, ptr [[IV_PROL]], i64 8 +; CHECK-NEXT:[[ICMP_PROL:%.*]] = icmp eq ptr [[IV_PROL]], [[GEP]] +; CHECK-NEXT:[[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 +; CHECK-NEXT:[[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT:br i1 [[PROL_ITER_CMP]], label %[[LOOP_2_PROL]], label %[[LOOP_2_PROL_LOOPEXIT_UNR_LCSSA:.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[LOOP_2_PROL_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT:[[IV_UNR_PH:%.*]] = phi ptr [ [[IV_NEXT_PROL]], %[[LOOP_2_PROL]] ] +; CHECK-NEXT:br label %[[LOOP_2_PROL_LOOPEXIT]] +; CHECK: [[LOOP_2_PROL_LOOPEXIT]]: +; CHECK-NEXT:[[IV_UNR:%.*]] = phi ptr [ [[P]], %[[IF]] ], [ [[IV_UNR_PH]]
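The subtlety the fix addresses: when peeling adds a new incoming edge to an LCSSA phi, invalidating only the phi itself (forgetValue) leaves stale SCEV expressions that reach the phi through other values; in the test, the second loop's backedge-taken count is built from %load while the phi carries %add. A minimal sketch of the corrected call, with the surrounding peeling context assumed:

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    void invalidateAfterPeel(ScalarEvolution &SE, Loop *L, PHINode &PHI) {
      // Insufficient: drops cached SCEVs for the phi only.
      //   SE.forgetValue(&PHI);
      // Correct: also invalidates the roots of expressions using the phi, so
      // cached trip counts that depend on its inputs are recomputed.
      SE.forgetLcssaPhiWithNewPredecessor(L, &PHI);
    }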
[llvm-branch-commits] [llvm] release/19.x: [LoongArch] Eliminate the redundant sign extension of division (#107971) (PR #109125)
github-actions[bot] wrote: @heiher (or anyone else). If you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/109125 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] 997b66e - [clang-scan-deps] Don't inspect Args[0] as an option (#109050)
Author: Martin Storsjö Date: 2024-10-01T08:53:03+02:00 New Revision: 997b66e566886b8a395b852db46e7930f757b818 URL: https://github.com/llvm/llvm-project/commit/997b66e566886b8a395b852db46e7930f757b818 DIFF: https://github.com/llvm/llvm-project/commit/997b66e566886b8a395b852db46e7930f757b818.diff LOG: [clang-scan-deps] Don't inspect Args[0] as an option (#109050) Since a26ec542371652e1d774696e90016fd5b0b1c191, we expand the executable name to an absolute path, if it isn't already one, if found in path. This broke a couple tests in some environments; when the clang workdir resides in a path under e.g. /opt. Tests that only use a tool name like "clang-cl" would get expanded to the absolute path in the build tree. The loop for finding the last "-o" like option for clang-cl command lines would inspect all arguments, including Args[0] which is the executable name itself. As an /opt path matches Arg.starts_with("/o"), this would get detected as an object file output name in cases where there was no other explicit output argument. Thus, this fixes those tests in workdirs under e.g. /opt. (cherry picked from commit cead9044a995910306e2e64b426fcc8042d7e0ef) Added: Modified: clang/tools/clang-scan-deps/ClangScanDeps.cpp Removed: diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 0f581e73cdfe4b..867df19c863fe5 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -837,7 +837,12 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { // Reverse scan, starting at the end or at the element before "--". auto R = std::make_reverse_iterator(FlagsEnd); - for (auto I = R, E = Args.rend(); I != E; ++I) { + auto E = Args.rend(); + // Don't include Args[0] in the iteration; that's the executable, not + // an option. + if (E != R) +E--; + for (auto I = R; I != E; ++I) { StringRef Arg = *I; if (ClangCLMode) { // Ignore arguments that are preceded by "-Xclang". ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
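The bug was an iteration-bounds slip rather than an option-parsing one. A standalone sketch, simplified from the patch, of the corrected reverse scan in which the end iterator is pulled in by one so the executable name is never inspected:

    #include <string>
    #include <vector>

    // Find the last output-like argument without ever looking at Args[0]:
    // under clang-cl rules, an executable path such as "/opt/bin/clang-cl"
    // would otherwise match the /o prefix test.
    const std::string *findLastOutputArg(const std::vector<std::string> &Args) {
      auto R = Args.rbegin();
      auto E = Args.rend();
      if (E != R)
        --E; // exclude Args[0], the executable itself
      for (auto I = R; I != E; ++I)
        if (I->rfind("/o", 0) == 0 || I->rfind("-o", 0) == 0)
          return &*I;
      return nullptr;
    }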
[llvm-branch-commits] [clang] release/19.x: [clang-scan-deps] Don't inspect Args[0] as an option (#109050) (PR #109865)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/109865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [LoopPeel] Fix LCSSA phi node invalidation (PR #109624)
github-actions[bot] wrote: @nikic (or anyone else): if you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/109624 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [LoopPeel] Fix LCSSA phi node invalidation (PR #109624)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/109624 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] 149bfdd - [Clang][CodeGen] Fix type for atomic float incdec operators (#107075)
Author: Yingwei Zheng Date: 2024-10-01T08:47:00+02:00 New Revision: 149bfdd61c961edbf49c2ea7fadf9d3c1a79a55e URL: https://github.com/llvm/llvm-project/commit/149bfdd61c961edbf49c2ea7fadf9d3c1a79a55e DIFF: https://github.com/llvm/llvm-project/commit/149bfdd61c961edbf49c2ea7fadf9d3c1a79a55e.diff LOG: [Clang][CodeGen] Fix type for atomic float incdec operators (#107075) `llvm::ConstantFP::get(llvm::LLVMContext&, APFloat(float))` always returns a f32 constant. Fix https://github.com/llvm/llvm-project/issues/107054. Added: Modified: clang/lib/CodeGen/CGExprScalar.cpp clang/test/CodeGen/X86/x86-atomic-double.c clang/test/CodeGen/X86/x86-atomic-long_double.c Removed: diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index a17d68424bbce5..6e212e74676e8d 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2833,18 +2833,22 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, llvm::AtomicOrdering::SequentiallyConsistent); return isPre ? Builder.CreateBinOp(op, old, amt) : old; } -// Special case for atomic increment/decrement on floats +// Special case for atomic increment/decrement on floats. +// Bail out non-power-of-2-sized floating point types (e.g., x86_fp80). if (type->isFloatingType()) { - llvm::AtomicRMWInst::BinOp aop = - isInc ? llvm::AtomicRMWInst::FAdd : llvm::AtomicRMWInst::FSub; - llvm::Instruction::BinaryOps op = - isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub; - llvm::Value *amt = llvm::ConstantFP::get( - VMContext, llvm::APFloat(static_cast(1.0))); - llvm::Value *old = - Builder.CreateAtomicRMW(aop, LV.getAddress(), amt, - llvm::AtomicOrdering::SequentiallyConsistent); - return isPre ? Builder.CreateBinOp(op, old, amt) : old; + llvm::Type *Ty = ConvertType(type); + if (llvm::has_single_bit(Ty->getScalarSizeInBits())) { +llvm::AtomicRMWInst::BinOp aop = +isInc ? llvm::AtomicRMWInst::FAdd : llvm::AtomicRMWInst::FSub; +llvm::Instruction::BinaryOps op = +isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub; +llvm::Value *amt = llvm::ConstantFP::get(Ty, 1.0); +llvm::AtomicRMWInst *old = Builder.CreateAtomicRMW( +aop, LV.getAddress(), amt, +llvm::AtomicOrdering::SequentiallyConsistent); + +return isPre ? 
Builder.CreateBinOp(op, old, amt) : old; + } } value = EmitLoadOfLValue(LV, E->getExprLoc()); input = value; diff --git a/clang/test/CodeGen/X86/x86-atomic-double.c b/clang/test/CodeGen/X86/x86-atomic-double.c index 2354c89cc2b170..09c8f70c3db854 100644 --- a/clang/test/CodeGen/X86/x86-atomic-double.c +++ b/clang/test/CodeGen/X86/x86-atomic-double.c @@ -6,20 +6,14 @@ // X64-LABEL: define dso_local double @test_double_post_inc( // X64-SAME: ) #[[ATTR0:[0-9]+]] { // X64-NEXT: entry: -// X64-NEXT:[[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT:[[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.00e+00 seq_cst, align 8 -// X64-NEXT:store float [[TMP0]], ptr [[RETVAL]], align 8 -// X64-NEXT:[[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT:ret double [[TMP1]] +// X64-NEXT:[[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, double 1.00e+00 seq_cst, align 8 +// X64-NEXT:ret double [[TMP0]] // // X86-LABEL: define dso_local double @test_double_post_inc( // X86-SAME: ) #[[ATTR0:[0-9]+]] { // X86-NEXT: entry: -// X86-NEXT:[[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT:[[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.00e+00 seq_cst, align 8 -// X86-NEXT:store float [[TMP0]], ptr [[RETVAL]], align 4 -// X86-NEXT:[[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT:ret double [[TMP1]] +// X86-NEXT:[[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, double 1.00e+00 seq_cst, align 8 +// X86-NEXT:ret double [[TMP0]] // double test_double_post_inc() { @@ -30,20 +24,14 @@ double test_double_post_inc() // X64-LABEL: define dso_local double @test_double_post_dc( // X64-SAME: ) #[[ATTR0]] { // X64-NEXT: entry: -// X64-NEXT:[[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT:[[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.00e+00 seq_cst, align 8 -// X64-NEXT:store float [[TMP0]], ptr [[RETVAL]], align 8 -// X64-NEXT:[[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT:ret double [[TMP1]] +// X64-NEXT:[[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, double 1.00e+00 seq_cst, align 8 +// X64-NEXT:ret double [[TMP0]] // // X86-LABEL: define dso_local double @test_doub
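To make the root cause concrete, here is a reduced sketch. It assumes an LLVM development setup, uses an invented helper name (makeFPIncrement), and is not the CGExprScalar.cpp code itself. The APFloat(float) overload of ConstantFP::get pins the constant's type to f32 no matter which type is being incremented, while the Type*-based overload yields a constant of the requested width:

#include "llvm/ADT/APFloat.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"

// Illustrative only: build the "1.0" increment for an atomic float inc/dec.
llvm::Value *makeFPIncrement(llvm::LLVMContext &Ctx, llvm::Type *Ty) {
  // Buggy pattern: APFloat(float) always produces an f32 constant, even
  // when Ty is double, so the atomicrmw operand had the wrong type.
  llvm::Value *AlwaysF32 =
      llvm::ConstantFP::get(Ctx, llvm::APFloat(static_cast<float>(1.0)));
  (void)AlwaysF32;
  // Fixed pattern: the constant takes whatever width Ty has (f32, f64, ...).
  return llvm::ConstantFP::get(Ty, 1.0);
}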
[llvm-branch-commits] [llvm] b3731b3 - [DAGCombiner] cache negative result from getMergeStoreCandidates() (#106949)
Author: Princeton Ferro Date: 2024-10-01T08:47:51+02:00 New Revision: b3731b36421e23737be2b4785700267b96c3241f URL: https://github.com/llvm/llvm-project/commit/b3731b36421e23737be2b4785700267b96c3241f DIFF: https://github.com/llvm/llvm-project/commit/b3731b36421e23737be2b4785700267b96c3241f.diff LOG: [DAGCombiner] cache negative result from getMergeStoreCandidates() (#106949) Cache negative search result from getStoreMergeCandidates() so that mergeConsecutiveStores() does not iterate quadratically over a potentially long sequence of unmergeable stores. (cherry picked from commit 8f77d37f256809766fd83a09c6d144b785e9165a) Added: Modified: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Removed: diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 71cdec91e5f67a..7b1f1dc40211d5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -191,6 +191,11 @@ namespace { // AA - Used for DAG load/store alias analysis. AliasAnalysis *AA; +/// This caches all chains that have already been processed in +/// DAGCombiner::getStoreMergeCandidates() and found to have no mergeable +/// stores candidates. +SmallPtrSet ChainsWithoutMergeableStores; + /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. void AddUsersToWorklist(SDNode *N) { @@ -776,11 +781,10 @@ namespace { bool UseTrunc); /// This is a helper function for mergeConsecutiveStores. Stores that -/// potentially may be merged with St are placed in StoreNodes. RootNode is -/// a chain predecessor to all store candidates. -void getStoreMergeCandidates(StoreSDNode *St, - SmallVectorImpl &StoreNodes, - SDNode *&Root); +/// potentially may be merged with St are placed in StoreNodes. On success, +/// returns a chain predecessor to all store candidates. +SDNode *getStoreMergeCandidates(StoreSDNode *St, +SmallVectorImpl &StoreNodes); /// Helper function for mergeConsecutiveStores. Checks if candidate stores /// have indirect dependency through their operands. RootNode is the @@ -1782,6 +1786,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) { ++NodesCombined; +// Invalidate cached info. +ChainsWithoutMergeableStores.clear(); + // If we get back the same node we passed in, rather than a new node or // zero, we know that the node must have defined multiple values and // CombineTo was used. Since CombineTo takes care of the worklist @@ -20372,15 +20379,15 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( return true; } -void DAGCombiner::getStoreMergeCandidates( -StoreSDNode *St, SmallVectorImpl &StoreNodes, -SDNode *&RootNode) { +SDNode * +DAGCombiner::getStoreMergeCandidates(StoreSDNode *St, + SmallVectorImpl &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. We must have a base and an offset. Do not handle stores to undef // base pointers. BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); if (!BasePtr.getBase().getNode() || BasePtr.getBase().isUndef()) -return; +return nullptr; SDValue Val = peekThroughBitcasts(St->getValue()); StoreSource StoreSrc = getStoreSource(Val); @@ -20396,14 +20403,14 @@ void DAGCombiner::getStoreMergeCandidates( LoadVT = Ld->getMemoryVT(); // Load and store should be the same type. if (MemVT != LoadVT) - return; + return nullptr; // Loads must only have one use. 
if (!Ld->hasNUsesOfValue(1, 0)) - return; + return nullptr; // The memory operands must not be volatile/indexed/atomic. // TODO: May be able to relax for unordered atomics (see D66309) if (!Ld->isSimple() || Ld->isIndexed()) - return; + return nullptr; } auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { @@ -20471,6 +20478,27 @@ void DAGCombiner::getStoreMergeCandidates( return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); }; + // We are looking for a root node which is an ancestor to all mergable + // stores. We search up through a load, to our root and then down + // through all children. For instance we will find Store{1,2,3} if + // St is Store1, Store2. or Store3 where the root is not a load + // which always true for nonvolatile ops. TODO: Expand + // the search to find all valid candidates through multiple layers of loads. + // + // Root + // |---|---| + // LoadLoadStore3 + // | | + // Sto
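The caching pattern itself is simple; the following self-contained C++ sketch (generic code with invented types, not the DAGCombiner itself) shows the idea: record chain roots whose candidate search already came up empty so later queries return immediately, and clear the cache whenever a combine succeeds, since a successful combine can create new merge opportunities:

#include <unordered_set>
#include <vector>

struct StoreNode {
  const StoreNode *Chain = nullptr; // chain predecessor, stand-in field
  bool Mergeable = false;
};

class Combiner {
  // Negative-result cache, analogous to ChainsWithoutMergeableStores.
  std::unordered_set<const StoreNode *> NoMergeCache;

public:
  bool hasMergeCandidates(const StoreNode *Root,
                          const std::vector<StoreNode> &Stores) {
    if (NoMergeCache.count(Root))
      return false; // already proven fruitless; skip the linear scan
    for (const StoreNode &S : Stores)
      if (S.Chain == Root && S.Mergeable) // stand-in candidate predicate
        return true;
    NoMergeCache.insert(Root); // remember the failure for next time
    return false;
  }

  // Mirrors the clear() the patch adds to DAGCombiner::Run: any successful
  // combine invalidates previous negative results.
  void nodeCombined() { NoMergeCache.clear(); }
};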
[llvm-branch-commits] [clang] Backport "[Clang][CodeGen] Fix type for atomic float incdec operators (#107075)" (PR #107184)
github-actions[bot] wrote: @dtcxzyw (or anyone else): if you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/107184 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [DAGCombiner] cache negative result from getMergeStoreCandidates() (#106949) (PR #108397)
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/108397 >From b3731b36421e23737be2b4785700267b96c3241f Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Wed, 4 Sep 2024 07:18:53 -0700 Subject: [PATCH] [DAGCombiner] cache negative result from getMergeStoreCandidates() (#106949) Cache negative search result from getStoreMergeCandidates() so that mergeConsecutiveStores() does not iterate quadratically over a potentially long sequence of unmergeable stores. (cherry picked from commit 8f77d37f256809766fd83a09c6d144b785e9165a) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 83 --- 1 file changed, 51 insertions(+), 32 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 71cdec91e5f67a..7b1f1dc40211d5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -191,6 +191,11 @@ namespace { // AA - Used for DAG load/store alias analysis. AliasAnalysis *AA; +/// This caches all chains that have already been processed in +/// DAGCombiner::getStoreMergeCandidates() and found to have no mergeable +/// stores candidates. +SmallPtrSet ChainsWithoutMergeableStores; + /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. void AddUsersToWorklist(SDNode *N) { @@ -776,11 +781,10 @@ namespace { bool UseTrunc); /// This is a helper function for mergeConsecutiveStores. Stores that -/// potentially may be merged with St are placed in StoreNodes. RootNode is -/// a chain predecessor to all store candidates. -void getStoreMergeCandidates(StoreSDNode *St, - SmallVectorImpl &StoreNodes, - SDNode *&Root); +/// potentially may be merged with St are placed in StoreNodes. On success, +/// returns a chain predecessor to all store candidates. +SDNode *getStoreMergeCandidates(StoreSDNode *St, +SmallVectorImpl &StoreNodes); /// Helper function for mergeConsecutiveStores. Checks if candidate stores /// have indirect dependency through their operands. RootNode is the @@ -1782,6 +1786,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) { ++NodesCombined; +// Invalidate cached info. +ChainsWithoutMergeableStores.clear(); + // If we get back the same node we passed in, rather than a new node or // zero, we know that the node must have defined multiple values and // CombineTo was used. Since CombineTo takes care of the worklist @@ -20372,15 +20379,15 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( return true; } -void DAGCombiner::getStoreMergeCandidates( -StoreSDNode *St, SmallVectorImpl &StoreNodes, -SDNode *&RootNode) { +SDNode * +DAGCombiner::getStoreMergeCandidates(StoreSDNode *St, + SmallVectorImpl &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. We must have a base and an offset. Do not handle stores to undef // base pointers. BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); if (!BasePtr.getBase().getNode() || BasePtr.getBase().isUndef()) -return; +return nullptr; SDValue Val = peekThroughBitcasts(St->getValue()); StoreSource StoreSrc = getStoreSource(Val); @@ -20396,14 +20403,14 @@ void DAGCombiner::getStoreMergeCandidates( LoadVT = Ld->getMemoryVT(); // Load and store should be the same type. if (MemVT != LoadVT) - return; + return nullptr; // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) - return; + return nullptr; // The memory operands must not be volatile/indexed/atomic. 
// TODO: May be able to relax for unordered atomics (see D66309) if (!Ld->isSimple() || Ld->isIndexed()) - return; + return nullptr; } auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { @@ -20471,6 +20478,27 @@ void DAGCombiner::getStoreMergeCandidates( return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); }; + // We are looking for a root node which is an ancestor to all mergable + // stores. We search up through a load, to our root and then down + // through all children. For instance we will find Store{1,2,3} if + // St is Store1, Store2. or Store3 where the root is not a load + // which always true for nonvolatile ops. TODO: Expand + // the search to find all valid candidates through multiple layers of loads. + // + // Root + // |---|---| + // LoadLoadStore3 + // | | + // Store1 Store2 + // + // FIXME: We should be able to climb and + // descend TokenFactors to find candidates as well. + + S
[llvm-branch-commits] [llvm] release/19.x: [DAGCombiner] cache negative result from getMergeStoreCandidates() (#106949) (PR #108397)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/108397 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [DAGCombiner] cache negative result from getMergeStoreCandidates() (#106949) (PR #108397)
github-actions[bot] wrote: @nikic (or anyone else): if you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/108397 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] 8a25c60 - [libc++] Disable the clang-tidy checks to get CI back (#109989)
Author: Louis Dionne Date: 2024-09-26T16:00:43-04:00 New Revision: 8a25c601eb64bcdb7c6c74bee52655468dfdd91b URL: https://github.com/llvm/llvm-project/commit/8a25c601eb64bcdb7c6c74bee52655468dfdd91b DIFF: https://github.com/llvm/llvm-project/commit/8a25c601eb64bcdb7c6c74bee52655468dfdd91b.diff LOG: [libc++] Disable the clang-tidy checks to get CI back (#109989) The CI has been a complete mess for the past week, and the only thing preventing it from being back is the Clang tidy checks. Disable them (as a total hack) to get CI back. (cherry picked from commit 78c6506543dee13c9335edc5c85bc73c4853fbd7) Added: Modified: libcxx/test/tools/clang_tidy_checks/CMakeLists.txt Removed: diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt index f0289dc44c6625..125b2184a49ea6 100644 --- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt +++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt @@ -1,3 +1,5 @@ +# TODO: Re-enable the tests once the CI is back under control +return() # The find_package changes these variables. This leaves the build in an odd # state. Calling cmake a second time tries to write site config information in ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [SDAG] Honor signed arguments in floating point libcalls (#109134) (PR #109920)
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/109920 >From 8679d1b51bd91d638ac3babba03a404e4031f9ea Mon Sep 17 00:00:00 2001 From: Timothy Pearson <162513562+tpearson-...@users.noreply.github.com> Date: Wed, 25 Sep 2024 02:09:50 -0500 Subject: [PATCH] [SDAG] Honor signed arguments in floating point libcalls (#109134) In ExpandFPLibCall, an assumption is made that all floating point libcalls that take integer arguments use unsigned integers. In the case of ldexp and frexp, this assumption is incorrect, leading to miscompilation and subsequent target-dependent incorrect operation. Indicate that ldexp and frexp utilize signed arguments in ExpandFPLibCall. Fixes #108904 Signed-off-by: Timothy Pearson (cherry picked from commit 90c14748638f1e10e31173b145fdbb5c4529c922) --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 3 +- llvm/test/CodeGen/PowerPC/ldexp-libcall.ll| 4 +- llvm/test/CodeGen/PowerPC/ldexp.ll| 36 ++ .../PowerPC/negative-integer-fp-libcall.ll| 26 +++ .../X86/fold-int-pow2-with-fmul-or-fdiv.ll| 69 --- 5 files changed, 96 insertions(+), 42 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/negative-integer-fp-libcall.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 7f5b46af01c62f..4b25f553ffae91 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2190,7 +2190,8 @@ void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, Results.push_back(Tmp.first); Results.push_back(Tmp.second); } else { -SDValue Tmp = ExpandLibCall(LC, Node, false).first; +bool IsSignedArgument = Node->getOpcode() == ISD::FLDEXP; +SDValue Tmp = ExpandLibCall(LC, Node, IsSignedArgument).first; Results.push_back(Tmp); } } diff --git a/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll b/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll index 6144a9d9203651..e531516c37e87e 100644 --- a/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll +++ b/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll @@ -10,7 +10,7 @@ define float @call_ldexpf(float %a, i32 %b) { ; CHECK-NEXT:std r0, 48(r1) ; CHECK-NEXT:.cfi_def_cfa_offset 32 ; CHECK-NEXT:.cfi_offset lr, 16 -; CHECK-NEXT:clrldi r4, r4, 32 +; CHECK-NEXT:extsw r4, r4 ; CHECK-NEXT:bl ldexpf ; CHECK-NEXT:nop ; CHECK-NEXT:addi r1, r1, 32 @@ -29,7 +29,7 @@ define double @call_ldexp(double %a, i32 %b) { ; CHECK-NEXT:std r0, 48(r1) ; CHECK-NEXT:.cfi_def_cfa_offset 32 ; CHECK-NEXT:.cfi_offset lr, 16 -; CHECK-NEXT:clrldi r4, r4, 32 +; CHECK-NEXT:extsw r4, r4 ; CHECK-NEXT:bl ldexp ; CHECK-NEXT:nop ; CHECK-NEXT:addi r1, r1, 32 diff --git a/llvm/test/CodeGen/PowerPC/ldexp.ll b/llvm/test/CodeGen/PowerPC/ldexp.ll index 151df6096b30bd..ffc826cc86de59 100644 --- a/llvm/test/CodeGen/PowerPC/ldexp.ll +++ b/llvm/test/CodeGen/PowerPC/ldexp.ll @@ -57,22 +57,24 @@ define <2 x float> @ldexp_v2f32(<2 x float> %val, <2 x i32> %exp) { ; CHECK-NEXT:.cfi_offset v29, -48 ; CHECK-NEXT:.cfi_offset v30, -32 ; CHECK-NEXT:.cfi_offset v31, -16 -; CHECK-NEXT:xxsldwi vs0, v2, v2, 3 ; CHECK-NEXT:li r3, 0 +; CHECK-NEXT:xxsldwi vs0, v2, v2, 3 ; CHECK-NEXT:stxv v29, 32(r1) # 16-byte Folded Spill ; CHECK-NEXT:xscvspdpn f1, vs0 -; CHECK-NEXT:vextuwrx r4, r3, v3 +; CHECK-NEXT:vextuwrx r3, r3, v3 ; CHECK-NEXT:stxv v30, 48(r1) # 16-byte Folded Spill ; CHECK-NEXT:stxv v31, 64(r1) # 16-byte Folded Spill +; CHECK-NEXT:extsw r4, r3 ; CHECK-NEXT:vmr v31, v3 ; CHECK-NEXT:vmr v30, v2 ; CHECK-NEXT:bl ldexpf ; CHECK-NEXT:nop -; CHECK-NEXT:xxswapd vs0, v30 ; CHECK-NEXT:li r3, 4 +; CHECK-NEXT:xxswapd 
vs0, v30 ; CHECK-NEXT:xscvdpspn v29, f1 ; CHECK-NEXT:xscvspdpn f1, vs0 -; CHECK-NEXT:vextuwrx r4, r3, v31 +; CHECK-NEXT:vextuwrx r3, r3, v31 +; CHECK-NEXT:extsw r4, r3 ; CHECK-NEXT:bl ldexpf ; CHECK-NEXT:nop ; CHECK-NEXT:xscvdpspn vs0, f1 @@ -100,35 +102,39 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; CHECK-NEXT:.cfi_offset v29, -48 ; CHECK-NEXT:.cfi_offset v30, -32 ; CHECK-NEXT:.cfi_offset v31, -16 -; CHECK-NEXT:li r3, 12 -; CHECK-NEXT:xscvspdpn f1, v2 +; CHECK-NEXT:li r3, 4 +; CHECK-NEXT:xxswapd vs0, v2 ; CHECK-NEXT:stxv v28, 32(r1) # 16-byte Folded Spill +; CHECK-NEXT:xscvspdpn f1, vs0 +; CHECK-NEXT:vextuwrx r3, r3, v3 ; CHECK-NEXT:stxv v29, 48(r1) # 16-byte Folded Spill ; CHECK-NEXT:stxv v30, 64(r1) # 16-byte Folded Spill ; CHECK-NEXT:stxv v31, 80(r1) # 16-byte Folded Spill ; CHECK-NEXT:vmr v31, v3 +; CHECK-NEXT:extsw r4, r3 ; CHECK-NEXT:vmr v30, v2 -; CHECK-NEXT:vextuwrx r4, r3, v3 ; CHECK-NEXT:bl ldexpf ; CHECK-NEXT:nop -; CHECK-NEXT:xxswapd vs0, v30 -; CHECK-NEXT:li r3, 4 +; CHECK-NEXT:
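The underlying bug is easy to demonstrate outside the backend. This standalone C++ program (an illustration only, unrelated to the SDAG sources) shows why widening the ldexp exponent with a zero-extension is wrong: a negative 32-bit exponent becomes a huge unsigned value, so a callee reading a signed parameter would compute an overflowing power of two instead of the intended scale:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  int32_t Exp = -2;
  int64_t SignExtended = static_cast<int64_t>(Exp);   // -2, as intended
  uint64_t ZeroExtended = static_cast<uint32_t>(Exp); // 4294967294
  std::printf("sext: %lld  zext: %llu\n",
              static_cast<long long>(SignExtended),
              static_cast<unsigned long long>(ZeroExtended));
  // ldexp(1.0, -2) == 0.25; a zero-extended exponent would instead ask for
  // 1.0 * 2^4294967294, i.e. overflow to +inf.
  std::printf("ldexp(1.0, %d) = %f\n", Exp, std::ldexp(1.0, Exp));
  return 0;
}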
[llvm-branch-commits] [llvm] 8679d1b - [SDAG] Honor signed arguments in floating point libcalls (#109134)
Author: Timothy Pearson Date: 2024-10-01T08:55:02+02:00 New Revision: 8679d1b51bd91d638ac3babba03a404e4031f9ea URL: https://github.com/llvm/llvm-project/commit/8679d1b51bd91d638ac3babba03a404e4031f9ea DIFF: https://github.com/llvm/llvm-project/commit/8679d1b51bd91d638ac3babba03a404e4031f9ea.diff LOG: [SDAG] Honor signed arguments in floating point libcalls (#109134) In ExpandFPLibCall, an assumption is made that all floating point libcalls that take integer arguments use unsigned integers. In the case of ldexp and frexp, this assumption is incorrect, leading to miscompilation and subsequent target-dependent incorrect operation. Indicate that ldexp and frexp utilize signed arguments in ExpandFPLibCall. Fixes #108904 Signed-off-by: Timothy Pearson (cherry picked from commit 90c14748638f1e10e31173b145fdbb5c4529c922) Added: llvm/test/CodeGen/PowerPC/negative-integer-fp-libcall.ll Modified: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp llvm/test/CodeGen/PowerPC/ldexp-libcall.ll llvm/test/CodeGen/PowerPC/ldexp.ll llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll Removed: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 7f5b46af01c62f..4b25f553ffae91 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2190,7 +2190,8 @@ void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, Results.push_back(Tmp.first); Results.push_back(Tmp.second); } else { -SDValue Tmp = ExpandLibCall(LC, Node, false).first; +bool IsSignedArgument = Node->getOpcode() == ISD::FLDEXP; +SDValue Tmp = ExpandLibCall(LC, Node, IsSignedArgument).first; Results.push_back(Tmp); } } diff --git a/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll b/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll index 6144a9d9203651..e531516c37e87e 100644 --- a/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll +++ b/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll @@ -10,7 +10,7 @@ define float @call_ldexpf(float %a, i32 %b) { ; CHECK-NEXT:std r0, 48(r1) ; CHECK-NEXT:.cfi_def_cfa_offset 32 ; CHECK-NEXT:.cfi_offset lr, 16 -; CHECK-NEXT:clrldi r4, r4, 32 +; CHECK-NEXT:extsw r4, r4 ; CHECK-NEXT:bl ldexpf ; CHECK-NEXT:nop ; CHECK-NEXT:addi r1, r1, 32 @@ -29,7 +29,7 @@ define double @call_ldexp(double %a, i32 %b) { ; CHECK-NEXT:std r0, 48(r1) ; CHECK-NEXT:.cfi_def_cfa_offset 32 ; CHECK-NEXT:.cfi_offset lr, 16 -; CHECK-NEXT:clrldi r4, r4, 32 +; CHECK-NEXT:extsw r4, r4 ; CHECK-NEXT:bl ldexp ; CHECK-NEXT:nop ; CHECK-NEXT:addi r1, r1, 32 diff --git a/llvm/test/CodeGen/PowerPC/ldexp.ll b/llvm/test/CodeGen/PowerPC/ldexp.ll index 151df6096b30bd..ffc826cc86de59 100644 --- a/llvm/test/CodeGen/PowerPC/ldexp.ll +++ b/llvm/test/CodeGen/PowerPC/ldexp.ll @@ -57,22 +57,24 @@ define <2 x float> @ldexp_v2f32(<2 x float> %val, <2 x i32> %exp) { ; CHECK-NEXT:.cfi_offset v29, -48 ; CHECK-NEXT:.cfi_offset v30, -32 ; CHECK-NEXT:.cfi_offset v31, -16 -; CHECK-NEXT:xxsldwi vs0, v2, v2, 3 ; CHECK-NEXT:li r3, 0 +; CHECK-NEXT:xxsldwi vs0, v2, v2, 3 ; CHECK-NEXT:stxv v29, 32(r1) # 16-byte Folded Spill ; CHECK-NEXT:xscvspdpn f1, vs0 -; CHECK-NEXT:vextuwrx r4, r3, v3 +; CHECK-NEXT:vextuwrx r3, r3, v3 ; CHECK-NEXT:stxv v30, 48(r1) # 16-byte Folded Spill ; CHECK-NEXT:stxv v31, 64(r1) # 16-byte Folded Spill +; CHECK-NEXT:extsw r4, r3 ; CHECK-NEXT:vmr v31, v3 ; CHECK-NEXT:vmr v30, v2 ; CHECK-NEXT:bl ldexpf ; CHECK-NEXT:nop -; CHECK-NEXT:xxswapd vs0, v30 ; CHECK-NEXT:li r3, 4 +; CHECK-NEXT:xxswapd vs0, v30 ; CHECK-NEXT:xscvdpspn v29, f1 ; CHECK-NEXT:xscvspdpn f1, vs0 -; 
CHECK-NEXT:vextuwrx r4, r3, v31 +; CHECK-NEXT:vextuwrx r3, r3, v31 +; CHECK-NEXT:extsw r4, r3 ; CHECK-NEXT:bl ldexpf ; CHECK-NEXT:nop ; CHECK-NEXT:xscvdpspn vs0, f1 @@ -100,35 +102,39 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { ; CHECK-NEXT:.cfi_offset v29, -48 ; CHECK-NEXT:.cfi_offset v30, -32 ; CHECK-NEXT:.cfi_offset v31, -16 -; CHECK-NEXT:li r3, 12 -; CHECK-NEXT:xscvspdpn f1, v2 +; CHECK-NEXT:li r3, 4 +; CHECK-NEXT:xxswapd vs0, v2 ; CHECK-NEXT:stxv v28, 32(r1) # 16-byte Folded Spill +; CHECK-NEXT:xscvspdpn f1, vs0 +; CHECK-NEXT:vextuwrx r3, r3, v3 ; CHECK-NEXT:stxv v29, 48(r1) # 16-byte Folded Spill ; CHECK-NEXT:stxv v30, 64(r1) # 16-byte Folded Spill ; CHECK-NEXT:stxv v31, 80(r1) # 16-byte Folded Spill ; CHECK-NEXT:vmr v31, v3 +; CHECK-NEXT:extsw r4, r3 ; CHECK-NEXT:vmr v30, v2 -; CHECK-NEXT:vextuwrx r4, r3, v3 ; CHECK-NEXT:bl ldexpf ; CHECK-NEXT:nop -; CHECK-NEXT:xxswapd vs0, v30 -; CHECK-NEXT:li r3, 4 +; CHECK-NEXT:li r3, 12 ; CHECK-NEXT:
[llvm-branch-commits] [llvm] release/19.x: [SDAG] Honor signed arguments in floating point libcalls (#109134) (PR #109920)
https://github.com/tru closed https://github.com/llvm/llvm-project/pull/109920 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [SDAG] Honor signed arguments in floating point libcalls (#109134) (PR #109920)
github-actions[bot] wrote: @nikic (or anyone else): if you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/109920 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits