llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Ricardo Jesus (rj-jesus)

<details>
<summary>Changes</summary>

Currently, given:

```cpp
svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}
```

We generate:

```gas
foo:
  ptrue p0.b
  ld1b { z0.b }, p0/z, [x0]
  ret
```

On little-endian, we could instead be using LDR as follows:

```gas
foo:
  ldr z0, [x0]
  ret
```

The second form avoids the predicate dependency. Likewise for other types and stores.

This generates a fair number of test changes, but all except `llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll` seem benign.

---

Patch is 413.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127837.diff

69 Files Affected:

- (modified) clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c (+6-3)
- (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+14)
- (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+1)
- (modified) llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll (+24-28)
- (modified) llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll (+6-7)
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll (+18-18)
- (modified) llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll (+1-2)
- (modified) llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll (+48-53)
- (modified) llvm/test/CodeGen/AArch64/nontemporal-load.ll (+10-12)
- (modified) llvm/test/CodeGen/AArch64/sinksplat.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll (+161-137)
- (modified) llvm/test/CodeGen/AArch64/sme-peephole-opts.ll (+2-4)
- (modified) llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll (+43-86)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-interface.ll (+1-2)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll (+6-12)
- (modified) 
llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll (+4-6) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll (+22-37) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll (+14-28) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll (+14-28) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll (+9-18) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll (+8-16) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll (+4-8) - (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll (+6-12) - (modified) llvm/test/CodeGen/AArch64/spillfill-sve.ll (+32-32) - (modified) llvm/test/CodeGen/AArch64/split-vector-insert.ll (+24-28) - (modified) llvm/test/CodeGen/AArch64/stack-guard-sve.ll (+6-8) - (modified) llvm/test/CodeGen/AArch64/stack-hazard.ll (+9-18) - (modified) llvm/test/CodeGen/AArch64/sve-aliasing.ll (+26-26) - (modified) llvm/test/CodeGen/AArch64/sve-alloca.ll (+2-3) - (modified) llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll (+6-11) - (modified) llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll (+152-168) - (modified) llvm/test/CodeGen/AArch64/sve-dead-masked-store.ll (+2-4) - (modified) llvm/test/CodeGen/AArch64/sve-extload-icmp.ll (+4-4) - (modified) llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll (+44-53) - (modified) llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll (+7-13) - (modified) llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll (+11-11) - (modified) llvm/test/CodeGen/AArch64/sve-fp.ll (+2-3) - (modified) llvm/test/CodeGen/AArch64/sve-fpext-load.ll (+2-4) - (modified) llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll (+10-11) - (modified) llvm/test/CodeGen/AArch64/sve-insert-element.ll (+4-4) - (modified) llvm/test/CodeGen/AArch64/sve-insert-vector.ll (+49-61) - (modified) llvm/test/CodeGen/AArch64/sve-int-arith.ll (+1-1) - (modified) llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll (+10-22) - (modified) 
llvm/test/CodeGen/AArch64/sve-ld1r.ll (+5-9) - (modified) llvm/test/CodeGen/AArch64/sve-llrint.ll (+655-648) - (modified) llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll (+8-12) - (modified) llvm/test/CodeGen/AArch64/sve-lrint.ll (+655-648) - (modified) llvm/test/CodeGen/AArch64/sve-lsrchain.ll (+20-20) - (modified) llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll (+8-9) - (modified) llvm/test/CodeGen/AArch64/sve-min-max-pred.ll (+1-1) - (modified) llvm/test/CodeGen/AArch64/sve-pr92779.ll (+1-2) - (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll (+3-3) - (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll (+3-3) - (modified) llvm/test/CodeGen/AArch64/sve-reassocadd.ll (+9-18) - (modified) llvm/test/CodeGen/AArch64/sve-redundant-store.ll (+8-14) - (modified) llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll (+18-25) - (modified) llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll (+28-33) - (modified) llvm/test/CodeGen/AArch64/sve-split-load.ll (+17-21) - (modified) llvm/test/CodeGen/AArch64/sve-split-store.ll (+14-17) - (modified) llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll (+10-22) - (modified) llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll (+9-14) - (modified) llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll (+2-3) - (modified) llvm/test/CodeGen/AArch64/sve-vector-compress.ll (+3-3) - (modified) llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll (+11-11) - (modified) llvm/test/CodeGen/AArch64/sve2-rsh.ll (+1-2) - (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll (+16-24) - (modified) llvm/test/CodeGen/AArch64/zext-to-tbl.ll (+6-6) - (modified) llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll (+29-38) ``````````diff diff --git a/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c b/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c index 692d11d97f486..0ed14b4b3b793 100644 --- a/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c +++ 
b/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c @@ -13,9 +13,12 @@ void func(int *restrict a, int *restrict b) { // CHECK-LABEL: func -// CHECK256-COUNT-8: st1w -// CHECK512-COUNT-4: st1w -// CHECK1024-COUNT-2: st1w +// CHECK256-COUNT-1: str +// CHECK256-COUNT-7: st1w +// CHECK512-COUNT-1: str +// CHECK512-COUNT-3: st1w +// CHECK1024-COUNT-1: str +// CHECK1024-COUNT-1: st1w // CHECK2048-COUNT-1: st1w #pragma clang loop vectorize(enable) for (int i = 0; i < 64; ++i) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 28aecd14e33fa..d1393aebe3ad9 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2977,14 +2977,28 @@ let Predicates = [HasSVE_or_SME] in { // Allow using the reg+reg form of ld1b/st1b for memory accesses with the // same width as nxv16i8. This saves an add in cases where we would // otherwise compute the address separately. + // Also allow using LDR/STR to avoid the predicate dependence. 
multiclass unpred_loadstore_bitcast<ValueType Ty> { let Predicates = [IsLE] in { def : Pat<(Ty (load (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset))), (LD1B (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>; def : Pat<(store Ty:$val, (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset)), (ST1B ZPR:$val, (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>; + + let AddedComplexity = 2 in { + def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))), + (LDR_ZXI GPR64sp:$base, simm9:$offset)>; + def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)), + (STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>; + } + + def : Pat<(Ty (load GPR64sp:$base)), + (LDR_ZXI GPR64sp:$base, (i64 0))>; + def : Pat<(store Ty:$val, GPR64sp:$base), + (STR_ZXI ZPR:$val, GPR64sp:$base, (i64 0))>; } } + defm : unpred_loadstore_bitcast<nxv16i8>; defm : unpred_loadstore_bitcast<nxv8i16>; defm : unpred_loadstore_bitcast<nxv8f16>; defm : unpred_loadstore_bitcast<nxv8bf16>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index e443c5ab150bd..48f71297f8377 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -9668,6 +9668,7 @@ multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatter let WantsRoot = true in { def am_sve_indexed_s4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-8, 7>">; def am_sve_indexed_s6 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-32, 31>">; + def am_sve_indexed_s9 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-256, 255>">; } def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", []>; diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll index 7244ac949ab88..3a808f5a02f0d 100644 --- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll +++ 
b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll @@ -13,13 +13,12 @@ define void @array_1D(ptr %addr) #0 { ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #2, mul vl] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -37,8 +36,7 @@ define %my_subtype @array_1D_extract(ptr %addr) #0 { ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -56,12 +54,11 @@ define void @array_1D_insert(ptr %addr, %my_subtype %elt) #0 { ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp] +; 
CHECK-NEXT: ldr z1, [x0, #2, mul vl] +; CHECK-NEXT: ldr z2, [x0] +; CHECK-NEXT: str z0, [sp, #1, mul vl] +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -80,19 +77,18 @@ define void @array_2D(ptr %addr) #0 { ; CHECK-NEXT: addvl sp, sp, #-6 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #5, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #5, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1d { z4.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #5, mul vl] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: ldr z3, [x0, #4, mul vl] +; CHECK-NEXT: ldr z4, [x0, #2, mul vl] +; CHECK-NEXT: ldr z5, [x0, #3, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: str z1, [sp, #5, mul vl] +; CHECK-NEXT: str z3, [sp, #4, mul vl] +; CHECK-NEXT: str z5, [sp, #3, mul vl] +; CHECK-NEXT: str z4, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #6 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll index f03a6f018d34d..e7d8f4ff39cee 100644 --- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll +++ 
b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll @@ -12,13 +12,12 @@ define void @test(ptr %addr) #0 { ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x0, #2, mul vl] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 8e26ef6b87ecc..668dc18df6a0b 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -25,11 +25,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: ldr z3, [x0] ; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1] +; CHECK-NEXT: ldr z4, [x1, #1, mul vl] +; CHECK-NEXT: ldr z5, [x1] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: add x0, x0, x10 ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 @@ -114,11 
+114,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z1.d, z1.d, z3.d ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ldr z2, [x0, #1, mul vl] +; CHECK-NEXT: ldr z3, [x0] ; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1] +; CHECK-NEXT: ldr z4, [x1, #1, mul vl] +; CHECK-NEXT: ldr z5, [x1] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: add x0, x0, x10 ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 @@ -196,16 +196,16 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0] +; CHECK-NEXT: ldr z4, [x0, #1, mul vl] +; CHECK-NEXT: ldr z5, [x0] ; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1] -; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ldr z6, [x0, #3, mul vl] +; CHECK-NEXT: ldr z7, [x1, #1, mul vl] +; CHECK-NEXT: ldr z16, [x1] +; CHECK-NEXT: ldr z17, [x0, #2, mul vl] ; CHECK-NEXT: add x0, x0, x10 -; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl] -; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl] +; CHECK-NEXT: ldr z18, [x1, #3, mul vl] +; CHECK-NEXT: ldr z19, [x1, #2, mul vl] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0 ; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0 @@ -321,8 +321,8 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-NEXT: zip1 z1.d, z2.d, z2.d ; CHECK-NEXT: .LBB3_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop 
Header: Depth=1 -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ldr z3, [x0] +; CHECK-NEXT: ldr z4, [x0, #1, mul vl] ; CHECK-NEXT: add x0, x0, x11 ; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2] ; CHECK-NEXT: add x8, x8, x9 diff --git a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll index e6d5a2ac0fd79..820bc2c8a417f 100644 --- a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll +++ b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll @@ -97,8 +97,7 @@ define void @test_concat_fptrunc_v4f64_to_v4f32(ptr %ptr) #1 { ; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov z0.s, #1.00000000 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret entry: %0 = shufflevector <vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll index 542b2e90ffc15..d5b9d17a98d55 100644 --- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll @@ -103,9 +103,9 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(ptr %a, ptr % ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -147,9 +147,9 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(ptr %a, ptr ; CHECK-NEXT: csel 
x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -191,9 +191,9 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(ptr %a, ptr ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: str q1, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -211,10 +211,10 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr % ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ptrue p1.d, vl8 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: ldr z0, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll index d1171bc312473..69e805d9ca2ee 100644 --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -328,15 +328,14 @@ define <vscale x 8 x i32> @splice_nxv8i32_idx(<vscale x 8 x i32> %a, <vscale x 8 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: orr x8, x8, #0x8 -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1w { z0.s }, p0, [sp] -; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: str z3, [sp, #3, mul vl] +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [x8] +; CHECK-NEXT: ldr z1, [x8, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -354,22 +353,22 @@ define <vscale x 16 x float> @splice_nxv16f32_16(<vscale x 16 x float> %a, <vsca ; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: str z3, [sp, #3, mul vl] ; CHECK-NEXT: cmp x8, #16 -; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] +; CHECK-NEXT: str z2, [sp, #2, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: add x10, x9, x8, lsl #2 -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] -; CHECK-NEXT: st1w { z0.s }, p0, [sp] -; CHECK-NEXT: st1w { z7.s }, p0, [sp, #7, mul vl] -; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl] -; CHECK-NEXT: st1w { z5.s }, p0, [sp, #5, mul vl] -; CHECK-NEXT: st1w { z6.s }, p0, [sp, #6, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: str z7, [sp, #7, mul vl] +; CHECK-NEXT: str z4, [sp, #4, mul vl] +; CHECK-NEXT: str z5, [sp, #5, mul vl] +; CHECK-NEXT: str z6, [sp, #6, mul vl] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, x8, lsl #2] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x10, #1, mul vl] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x10, #2, mul vl] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x10, #3, mul vl] +; CHECK-NEXT: ldr 
z1, [x10, #1, mul vl] +; CHECK-NEXT: ldr z2, [x10, #2, mul vl] +; CHECK-NEXT: ldr z3, [x10, #3, mul vl] ; CHECK-NEXT: addvl sp, sp, #8 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -453,16 +452,15 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg17(<vscale x 16 x i8> %a, <vscale x ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, #17 // =0x11 -; CHECK-NEXT: cmp x8, #17 ; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x8, #17 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: csel x9, x8, x9, lo ; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: st1b { z0.b }, p0, [sp] -; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: ldr z0, [x8] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -498,16 +496,15 @@ define <vscale x 8 x i16> @splice_nxv8i16_neg9(<vscale x 8 x i16> %a, <vscale x ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #18 // =0x12 -; CHECK-NEXT: cmp x8, #18 ; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x8, #18 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: csel x9, x8, x9, lo ; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] ; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: ldr z0, [x8] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -609,16 +606,15 @@ define <vscale x 8 x half> @splice_nxv8f16_neg9(<vscale x 8 x half> %a, <vscale ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #18 // =0x12 -; CHECK-NEXT: cmp x8, #18 ; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x8, #18 +; CHECK-NEXT: str z0, [sp] ; CHECK-NEXT: csel x9, x8, x9, lo ; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: st1... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/127837 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits