llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Fabian Ritter (ritter-x2a) <details> <summary>Changes</summary> This patch changes the memset lowering to match the optimized memcpy lowering. The memset lowering now queries TTI.getMemcpyLoopLoweringType for a preferred memory access type. If that type is larger than a byte, the memset is lowered into two loops: a main loop that stores a sufficiently wide vector splat of the SetValue with the preferred memory access type and a residual loop that covers the remaining bytes individually. If the memset size is statically known, the residual loop is replaced by a sequence of stores. This improves memset performance on gfx1030 (AMDGPU) in microbenchmarks by around 7-20x. I'm planning similar treatment for memset.pattern as a follow-up PR. For SWDEV-543208. --- Patch is 343.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169040.diff 17 Files Affected: - (modified) llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h (+2-1) - (modified) llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp (+4-2) - (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+2-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+2-1) - (modified) llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp (+1-1) - (modified) llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp (+7-4) - (modified) llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp (+197-7) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll (+103-11) - (modified) llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll (+686-90) - (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+218-116) - (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll (+15-36) - (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll (+55-13) - (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+1616) - (added) 
llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll (+1900) - (modified) llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll (+4-4) - (modified) llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll (+2-2) - (modified) llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll (+12-12) ``````````diff diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h index d4e72a60fc1ea..8924b8b1e6e54 100644 --- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h +++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h @@ -59,7 +59,8 @@ LLVM_ABI bool expandMemMoveAsLoop(MemMoveInst *MemMove, const TargetTransformInfo &TTI); /// Expand \p MemSet as a loop. \p MemSet is not deleted. -LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet); +LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet, + const TargetTransformInfo &TTI); /// Expand \p MemSetPattern as a loop. \p MemSet is not deleted. LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet); diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index d738dc4eea36d..88e2bb81f9e3b 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -369,7 +369,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses( canEmitLibcall(TM, ParentFunc, RTLIB::MEMSET)) break; - expandMemSetAsLoop(Memset); + expandMemSetAsLoop(Memset, TTI); Changed = true; Memset->eraseFromParent(); } @@ -384,7 +384,9 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses( if (isa<ConstantInt>(Memset->getLength())) break; - expandMemSetAsLoop(Memset); + Function *ParentFunc = Memset->getFunction(); + const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); + expandMemSetAsLoop(Memset, TTI); Changed = true; Memset->eraseFromParent(); break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index fdff21b6ef8df..76f1e006bbf74 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -635,7 +635,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst( MemSetInst &MSI) { if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) return false; - llvm::expandMemSetAsLoop(&MSI); + llvm::expandMemSetAsLoop(&MSI, + TM->getTargetTransformInfo(*MSI.getFunction())); MSI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 03d16fdd54c42..5a68dca1b10b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -80,7 +80,8 @@ static cl::opt<size_t> InlineMaxBB( static cl::opt<unsigned> MemcpyLoopUnroll( "amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " - "operations when lowering memcpy as a loop"), + "operations when lowering statically-sized memcpy, memmove, or " + "memset as a loop"), cl::init(16), cl::Hidden); static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index ac6f4061b9f1f..d0b50d2610bd5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -128,7 +128,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) { expandMemMoveAsLoop(Memmove, TTI); } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) { - expandMemSetAsLoop(Memset); + expandMemSetAsLoop(Memset, TTI); } MemCall->eraseFromParent(); } diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 
be88f334d2171..8bee6da75cc75 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -23,6 +23,7 @@ #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IRBuilder.h" @@ -93,7 +94,8 @@ static Function *getOrCreateFunction(Module *M, Type *RetTy, return NewF; } -static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) { +static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic, + const TargetTransformInfo &TTI) { // For @llvm.memset.* intrinsic cases with constant value and length arguments // are emulated via "storing" a constant array to the destination. For other // cases we wrap the intrinsic in @spirv.llvm_memset_* function and expand the @@ -137,7 +139,7 @@ static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) { auto *MemSet = IRB.CreateMemSet(Dest, Val, Len, MSI->getDestAlign(), MSI->isVolatile()); IRB.CreateRetVoid(); - expandMemSetAsLoop(cast<MemSetInst>(MemSet)); + expandMemSetAsLoop(cast<MemSetInst>(MemSet), TTI); MemSet->eraseFromParent(); break; } @@ -399,6 +401,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { bool Changed = false; const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F); SmallVector<Instruction *> EraseFromParent; + const TargetTransformInfo &TTI = TM.getTargetTransformInfo(*F); for (BasicBlock &BB : *F) { for (Instruction &I : make_early_inc_range(BB)) { auto Call = dyn_cast<CallInst>(&I); @@ -411,7 +414,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { switch (II->getIntrinsicID()) { case Intrinsic::memset: case Intrinsic::bswap: - Changed |= lowerIntrinsicToFunction(II); + Changed |= lowerIntrinsicToFunction(II, TTI); break; case Intrinsic::fshl: case Intrinsic::fshr: @@ -459,7 +462,7 @@ bool 
SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { return false; return II->getCalledFunction()->getName().starts_with(Prefix); })) - Changed |= lowerIntrinsicToFunction(II); + Changed |= lowerIntrinsicToFunction(II, TTI); break; } } diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 88e5d038bff82..07bfceb99d206 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -930,9 +930,187 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore, } } +/// Create a Value of \p DstType that consists of a sequence of copies of +/// \p SetValue, using bitcasts and a vector splat. +static Value *createMemSetSplat(const DataLayout &DL, IRBuilderBase &B, + Value *SetValue, Type *DstType) { + unsigned DstSize = DL.getTypeStoreSize(DstType); + Type *SetValueType = SetValue->getType(); + unsigned SetValueSize = DL.getTypeStoreSize(SetValueType); + assert(SetValueSize == DL.getTypeAllocSize(SetValueType) && + "Store size and alloc size of SetValue's type must match"); + assert(SetValueSize != 0 && DstSize % SetValueSize == 0 && + "DstType size must be a multiple of SetValue size"); + + Value *Result = SetValue; + if (DstSize != SetValueSize) { + if (!SetValueType->isIntegerTy() && !SetValueType->isFloatingPointTy()) { + // If the type cannot be put into a vector, bitcast to iN first. + LLVMContext &Ctx = SetValue->getContext(); + Result = B.CreateBitCast(Result, Type::getIntNTy(Ctx, SetValueSize * 8), + "setvalue.toint"); + } + // Form a sufficiently large vector consisting of SetValue, repeated. + Result = + B.CreateVectorSplat(DstSize / SetValueSize, Result, "setvalue.splat"); + } + + // The value has the right size, but we might have to bitcast it to the right + // type. 
+ if (Result->getType() != DstType) { + Result = B.CreateBitCast(Result, DstType, "setvalue.splat.cast"); + } + return Result; +} + +static void createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr, + ConstantInt *Len, Value *SetValue, + Align DstAlign, bool IsVolatile, + const TargetTransformInfo &TTI) { + // No need to expand zero length memsets. + if (Len->isZero()) + return; + + BasicBlock *PreLoopBB = InsertBefore->getParent(); + Function *ParentFunc = PreLoopBB->getParent(); + const DataLayout &DL = ParentFunc->getDataLayout(); + LLVMContext &Ctx = PreLoopBB->getContext(); + + unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); + + Type *TypeOfLen = Len->getType(); + Type *Int8Type = Type::getInt8Ty(Ctx); + assert(SetValue->getType() == Int8Type && "Can only set bytes"); + + // Use the same memory access type as for a memcpy with the same Dst and Src + // alignment and address space. + Type *LoopOpType = TTI.getMemcpyLoopLoweringType( + Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt); + unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); + + uint64_t LoopEndCount = alignDown(Len->getZExtValue(), LoopOpSize); + + if (LoopEndCount != 0) { + Value *SplatSetValue = nullptr; + { + IRBuilder<> PreLoopBuilder(InsertBefore); + SplatSetValue = + createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType); + } + + // Don't generate a residual loop, the remaining bytes are set with + // straight-line code. 
+ LoopExpansionInfo LEI = + insertLoopExpansion(InsertBefore, Len, LoopOpSize, 0, "static-memset"); + + // Fill MainLoopBB + IRBuilder<> MainLoopBuilder(LEI.MainLoopIP); + Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); + + Value *DstGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex); + + MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign, + IsVolatile); + + assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex && + "No residual loop was requested"); + } + + uint64_t BytesSet = LoopEndCount; + uint64_t RemainingBytes = Len->getZExtValue() - BytesSet; + if (RemainingBytes == 0) + return; + + IRBuilder<> RBuilder(InsertBefore); + + SmallVector<Type *, 5> RemainingOps; + TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, + DstAS, DstAS, DstAlign, DstAlign, + std::nullopt); + + Type *PreviousOpTy = nullptr; + Value *SplatSetValue = nullptr; + for (auto *OpTy : RemainingOps) { + unsigned OperandSize = DL.getTypeStoreSize(OpTy); + Align PartDstAlign(commonAlignment(DstAlign, BytesSet)); + + // Avoid recomputing the splat SetValue if it's the same as for the last + // iteration. 
+ if (OpTy != PreviousOpTy) + SplatSetValue = createMemSetSplat(DL, RBuilder, SetValue, OpTy); + + Value *DstGEP = RBuilder.CreateInBoundsGEP( + Int8Type, DstAddr, ConstantInt::get(TypeOfLen, BytesSet)); + RBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign, + IsVolatile); + BytesSet += OperandSize; + PreviousOpTy = OpTy; + } + assert(BytesSet == Len->getZExtValue() && + "Bytes set should match size in the call!"); +} + +static void createMemSetLoopUnknownSize(Instruction *InsertBefore, + Value *DstAddr, Value *Len, + Value *SetValue, Align DstAlign, + bool IsVolatile, + const TargetTransformInfo &TTI) { + BasicBlock *PreLoopBB = InsertBefore->getParent(); + Function *ParentFunc = PreLoopBB->getParent(); + const DataLayout &DL = ParentFunc->getDataLayout(); + LLVMContext &Ctx = PreLoopBB->getContext(); + + unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); + + Type *Int8Type = Type::getInt8Ty(Ctx); + assert(SetValue->getType() == Int8Type && "Can only set bytes"); + + Type *LoopOpType = TTI.getMemcpyLoopLoweringType( + Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt); + unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); + + Type *ResidualLoopOpType = Int8Type; + unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType); + + Value *SplatSetValue = SetValue; + { + IRBuilder<> PreLoopBuilder(InsertBefore); + SplatSetValue = createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType); + } + + LoopExpansionInfo LEI = insertLoopExpansion( + InsertBefore, Len, LoopOpSize, ResidualLoopOpSize, "dynamic-memset"); + + // Fill MainLoopBB + IRBuilder<> MainLoopBuilder(LEI.MainLoopIP); + Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); + + Value *DstGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex); + MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign, + IsVolatile); + + // Fill ResidualLoopBB + if (!LEI.ResidualLoopIP) + return; + + Align 
ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize)); + + IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP); + + Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, + LEI.ResidualLoopIndex); + ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign, + IsVolatile); +} + static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, Value *CopyLen, Value *SetValue, Align DstAlign, bool IsVolatile) { + // Currently no longer used for memset, only for memset.pattern. + // TODO: Update the memset.pattern lowering to also use the loop expansion + // framework and remove this function. Type *TypeOfCopyLen = CopyLen->getType(); BasicBlock *OrigBB = InsertBefore->getParent(); Function *F = OrigBB->getParent(); @@ -1067,13 +1245,25 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove, return true; } -void llvm::expandMemSetAsLoop(MemSetInst *Memset) { - createMemSetLoop(/* InsertBefore */ Memset, - /* DstAddr */ Memset->getRawDest(), - /* CopyLen */ Memset->getLength(), - /* SetValue */ Memset->getValue(), - /* Alignment */ Memset->getDestAlign().valueOrOne(), - Memset->isVolatile()); +void llvm::expandMemSetAsLoop(MemSetInst *Memset, + const TargetTransformInfo &TTI) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(Memset->getLength())) { + createMemSetLoopKnownSize( + /* InsertBefore */ Memset, + /* DstAddr */ Memset->getRawDest(), + /* Len */ CI, + /* SetValue */ Memset->getValue(), + /* DstAlign */ Memset->getDestAlign().valueOrOne(), + Memset->isVolatile(), TTI); + } else { + createMemSetLoopUnknownSize( + /* InsertBefore */ Memset, + /* DstAddr */ Memset->getRawDest(), + /* Len */ Memset->getLength(), + /* SetValue */ Memset->getValue(), + /* DstAlign */ Memset->getDestAlign().valueOrOne(), + Memset->isVolatile(), TTI); + } } void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll index 
04652af147f9b..4d35f3198bc0a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll @@ -1,27 +1,87 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s +; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s +; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1) define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) { ; LOOP-LABEL: memset_p1i8: -; LOOP: ; %bb.0: ; %loadstoreloop.preheader +; LOOP: ; %bb.0: +; LOOP-NEXT: v_and_b32_e32 v3, 0xff, v2 ; LOOP-NEXT: s_mov_b64 s[0:1], 0 ; LOOP-NEXT: s_mov_b32 s2, 0 ; LOOP-NEXT: s_mov_b32 s3, 0xf000 +; LOOP-NEXT: v_lshlrev_b32_e32 v4, 8, v3 +; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v3 +; LOOP-NEXT: v_or_b32_e32 v3, v3, v4 +; LOOP-NEXT: v_or_b32_e32 v3, v3, v5 +; LOOP-NEXT: v_or_b32_e32 v5, v3, v6 +; LOOP-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; LOOP-NEXT: v_bfe_u32 v7, v5, 8, 8 +; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v5 ; LOOP-NEXT: v_mov_b32_e32 v4, s1 ; LOOP-NEXT: v_mov_b32_e32 v3, s0 -; LOOP-NEXT: .LBB0_1: ; %loadstoreloop +; LOOP-NEXT: .LBB0_1: ; %static-memset-expansion-main-body ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 -; LOOP-NEXT: v_add_i32_e32 v5, vcc, v0, v3 -; LOOP-NEXT: v_addc_u32_e32 v6, vcc, v1, v4, vcc -; LOOP-NEXT: v_add_i32_e32 v3, vcc, 1, v3 +; LOOP-NEXT: v_add_i32_e32 v9, vcc, v0, v3 +; LOOP-NEXT: v_addc_u32_e32 v10, vcc, v1, v4, vcc +; LOOP-NEXT: v_add_i32_e32 v3, vcc, 32, v3 ; LOOP-NEXT: 
v_addc_u32_e32 v4, vcc, 0, v4, vcc -; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 4, v3 -; LOOP-NEXT: buffer_store_byte v2, v[5:6], s[0:3], 0 addr64 +; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 +; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:1 +; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:2 +; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:4 +; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:5 +; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:6 +; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:7 +; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:9 +; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:10 +; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:11 +; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:12 +; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:13 +; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:14 +; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:17 +; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:18 +; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:21 +; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:22 +; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:23 +; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:25 +; 
LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:26 +; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:27 +; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:29 +; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:30 +; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 32, v3 +; LOOP-NE... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/169040 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
