llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-aarch64 Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> Use LLSC or cmpxchg in the same cases as for the unsupported integer operations. This required some fixups to the LLSC implementation to deal with the fp128 case. --- Patch is 88.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/103702.diff 5 Files Affected: - (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+18-12) - (modified) llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll (+94-298) - (modified) llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll (+107-311) - (modified) llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll (+107-311) - (modified) llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll (+94-298) ``````````diff diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 40d9fa4f2b494a..26f698898e487b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27061,9 +27061,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes."); - if (AI->isFloatingPointOperation()) - return AtomicExpansionKind::CmpXChg; - bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 && (AI->getOperation() == AtomicRMWInst::Xchg || AI->getOperation() == AtomicRMWInst::Or || @@ -27073,7 +27070,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // Nand is not supported in LSE. // Leave 128 bits to LLSC or CmpXChg. 
- if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { + if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 && + !AI->isFloatingPointOperation()) { if (Subtarget->hasLSE()) return AtomicExpansionKind::None; if (Subtarget->outlineAtomics()) { @@ -27146,10 +27144,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); - Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); - return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64"); + + auto *Int128Ty = Type::getInt128Ty(Builder.getContext()); + Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64"); + Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64"); + + Value *Or = Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64"); + return Builder.CreateBitCast(Or, ValueTy); } Type *Tys[] = { Addr->getType() }; @@ -27160,8 +27162,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); CallInst *CI = Builder.CreateCall(Ldxr, Addr); - CI->addParamAttr( - 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy)); + CI->addParamAttr(0, Attribute::get(Builder.getContext(), + Attribute::ElementType, IntEltTy)); Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); return Builder.CreateBitCast(Trunc, ValueTy); @@ -27187,9 +27189,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, IsRelease ? 
Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; Function *Stxr = Intrinsic::getDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); + Type *Int128Ty = Type::getInt128Ty(M->getContext()); + + Value *CastVal = Builder.CreateBitCast(Val, Int128Ty); - Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); - Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); + Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo"); + Value *Hi = + Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi"); return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index f95caf325b197c..2c6461097f7d9b 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -6,33 +6,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: 
stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -63,33 +47,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -122,19 +90,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr 
h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fadd s2, s2, s1 @@ -143,21 +104,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -199,19 +148,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; 
NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fadd s2, s2, s1 @@ -220,21 +162,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -273,31 +203,15 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: 
Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fadd s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -324,31 +238,15 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fadd d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: 
.LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -375,54 +273,26 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16: ; NOLSE: // %bb.0: -; NOLSE-NEXT: sub sp, sp, #96 -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: sub sp, sp, #80 +; NOLSE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; NOLSE-NEXT: mov x19, x0 -; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; NOLSE-NEXT: stp x12, x13, [sp, #32] -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: ldr q1, [sp, #32] -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: b.eq .LBB6_6 -; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x19] +; NOLSE-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: stp x8, x9, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #48] +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill ; NOLSE-NEXT: bl __addtf3 -; NOLSE-NEXT: str q0, [sp, #48] -; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; NOLSE-NEXT: ldp x9, x8, [sp, #48] -; NOLSE-NEXT: str q0, [sp, #64] -; NOLSE-NEXT: ldp x11, x10, [sp, #64] -; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; 
NOLSE-NEXT: ldaxp x12, x13, [x19] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB6_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x9, x8, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: str q0, [sp, #32] +; NOLSE-NEXT: ldp x9, x8, [sp, #32] +; NOLSE-NEXT: stlxp w10, x9, x8, [x19] +; NOLSE-NEXT: cbnz w10, .LBB6_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; NOLSE-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #80 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16: @@ -463,35 +333,19 @@ define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: fcvtl v0.4s, v0.4h ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: 
fmov w8, s2 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 +; NOLSE-NEXT: fcvtl v1.4s, v1.4h +; NOLSE-NEXT: fadd v1.4s, v1.4s, v0.4s +; NOLSE-NEXT: fcvtn v1.4h, v1.4s +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: @@ -522,38 +376,22 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE: // %bb.0: ; NOLSE-NEXT: movi v1.4s, #1 ; NOLSE-NEXT: movi v2.4s, #127, msl #8 -; NOLSE-NEXT: shll v3.4s, v0.4h, #16 -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: shll v4.4s, v0.4h, #16 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v4.4s, v4.4s, v3.4s -; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 -; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b -; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s -; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: 
// => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s3, w8 +; NOLSE-NEXT: shll ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/103702 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits