https://gcc.gnu.org/g:fa38f8c2e043ffddaff023174926b0a30f002033
commit r15-11331-gfa38f8c2e043ffddaff023174926b0a30f002033
Author: Kyrylo Tkachov <[email protected]>
Date:   Mon Apr 20 00:56:01 2026 -0700

    aarch64: PR124908 Fix ICE in svld1rq fold with -msve-vector-bits=128

    svld1rq is a replicated-quadword load: it loads 16 bytes and replicates
    them to fill the SVE register. When -msve-vector-bits=128 the instruction
    can be folded to a normal load.

    The GIMPLE fold for svld1rq transforms the intrinsic into a 128-bit memory
    load followed by a VEC_PERM_EXPR that replicates the loaded value. When
    VL == 128, the VEC_PERM_EXPR becomes an identity permutation. The checking
    assertion that validates the permutation (can_vec_perm_const_p) fails for
    this degenerate case because the vec_perm_const hook does not recognise
    the cross-mode identity permutation (e.g. V16QI -> VNx16QI).

    Fix by detecting when the SVE vector has the same number of elements as
    the 128-bit quadword (known_eq (lhs_len, source_nelts)) and emitting a
    VIEW_CONVERT_EXPR instead of a VEC_PERM_EXPR.

    Bootstrapped and tested on aarch64-none-linux-gnu.

    Signed-off-by: Kyrylo Tkachov <[email protected]>

    gcc/ChangeLog:

            PR target/124908
            * config/aarch64/aarch64-sve-builtins-base.cc (svld1rq_impl::fold):
            When the SVE vector length equals the quadword width, emit
            VIEW_CONVERT_EXPR instead of VEC_PERM_EXPR.

    gcc/testsuite/ChangeLog:

            PR target/124908
            * gcc.target/aarch64/sve/acle/general/ld1rq_2.c: New test.

(cherry picked from commit b8ac2356c691f76c19246c4c7b94c23015b8b8aa) Diff: --- gcc/config/aarch64/aarch64-sve-builtins-base.cc | 14 ++++++++ .../gcc.target/aarch64/sve/acle/general/ld1rq_2.c | 37 ++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 7e6c25d0a7c3..eec244c412c2 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -1830,6 +1830,20 @@ public: gimple_seq_add_stmt_without_update (&stmts, mem_ref_stmt); int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant (); + + /* When the SVE vector has the same number of elements as the + 128-bit quadword (i.e. VL == 128), the load fills the entire + register and no replication is needed. Just convert the + loaded value from the Advanced SIMD type to the SVE type. */ + if (known_eq (lhs_len, (unsigned int) source_nelts)) + { + gimple *g + = gimple_build_assign (lhs, build1 (VIEW_CONVERT_EXPR, + lhs_type, mem_ref_lhs)); + gimple_seq_add_stmt_without_update (&stmts, g); + gsi_replace_with_seq_vops (f.gsi, stmts); + return g; + } vec_perm_builder sel (lhs_len, source_nelts, 1); for (int i = 0; i < source_nelts; i++) sel.quick_push (i); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ld1rq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ld1rq_2.c new file mode 100644 index 000000000000..84bf77328c5b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ld1rq_2.c @@ -0,0 +1,37 @@ +/* PR target/124908 */ +/* { dg-options "-O2 -msve-vector-bits=128" } */ + +#include <arm_sve.h> + +/* Verify that folding svld1rq does not ICE with -msve-vector-bits=128. 
*/ + +svuint8_t +f_u8 (const uint8_t *p) +{ + return svld1rq_u8 (svptrue_b8 (), p); +} + +svint8_t +f_s8 (const int8_t *p) +{ + return svld1rq_s8 (svptrue_b8 (), p); +} + +svuint16_t +f_u16 (const uint16_t *p) +{ + return svld1rq_u16 (svptrue_b16 (), p); +} + +svuint32_t +f_u32 (const uint32_t *p) +{ + return svld1rq_u32 (svptrue_b32 (), p); +} + +svfloat64_t +f_f64 (const float64_t *p) +{ + return svld1rq_f64 (svptrue_b64 (), p); +} +
