On Wed, Jul 9, 2025 at 7:09 AM Richard Sandiford <richard.sandif...@arm.com> wrote: > > When using SVE INDEX to load an Advanced SIMD vector, we need to > take account of the different element ordering for big-endian > targets. For example, when big-endian targets store the V4SI > constant { 0, 1, 2, 3 } in registers, 0 becomes the most > significant element, whereas INDEX always operates from the > least significant element. A big-endian target would therefore > load V4SI { 0, 1, 2, 3 } using: > > INDEX Z0.S, #3, #-1 > > rather than little-endian's: > > INDEX Z0.S, #0, #1 > > While there, I noticed that we would only check the first vector > in a multi-vector SVE constant, which would trigger an ICE if the > other vectors turned out to be invalid. This is pretty difficult to > trigger at the moment, since we only allow single-register modes to be > used as frontend & middle-end vector modes, but it can be seen using > the RTL frontend. > > Tested on aarch64-linux-gnu and aarch64_be-elf. OK to install?
When I was reviewing the original index patch internally, I was worried about this, but it looks like I didn't do a thorough enough job of it. Anyway, this is OK. Thanks, Andrew > > Richard > > > gcc/ > * config/aarch64/aarch64.cc (aarch64_sve_index_series_p): New > function, split out from... > (aarch64_simd_valid_imm): ...here. Account for the different > SVE and Advanced SIMD element orders on big-endian targets. > Check each vector in a structure mode. > > gcc/testsuite/ > * gcc.dg/rtl/aarch64/vec-series-1.c: New test. > * gcc.dg/rtl/aarch64/vec-series-2.c: Likewise. > * gcc.target/aarch64/sve/acle/general/dupq_2.c: Fix expected > output for this big-endian test. > * gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise. > * gcc.target/aarch64/sve/vec_init_3.c: Restrict to little-endian > targets and add more tests. > * gcc.target/aarch64/sve/vec_init_4.c: New big-endian version > of vec_init_3.c. > --- > gcc/config/aarch64/aarch64.cc | 59 ++++- > .../gcc.dg/rtl/aarch64/vec-series-1.c | 35 +++ > .../gcc.dg/rtl/aarch64/vec-series-2.c | 35 +++ > .../aarch64/sve/acle/general/dupq_2.c | 2 +- > .../aarch64/sve/acle/general/dupq_4.c | 2 +- > .../gcc.target/aarch64/sve/vec_init_3.c | 114 +++++++++- > .../gcc.target/aarch64/sve/vec_init_4.c | 209 ++++++++++++++++++ > 7 files changed, 446 insertions(+), 10 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/vec-series-1.c > create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/vec-series-2.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index ce25f4f6f9f..6d5b2009b2a 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -23074,6 +23074,58 @@ aarch64_sve_index_immediate_p (rtx base_or_step) > && IN_RANGE (INTVAL (base_or_step), -16, 15)); > } > > +/* Return true if SERIES is a constant vector that can be loaded using > + an immediate SVE INDEX, considering both SVE and 
Advanced SIMD modes. > + When returning true, store the base in *BASE_OUT and the step > + in *STEP_OUT. */ > + > +static bool > +aarch64_sve_index_series_p (rtx series, rtx *base_out, rtx *step_out) > +{ > + rtx base, step; > + if (!const_vec_series_p (series, &base, &step) > + || !CONST_INT_P (base) > + || !CONST_INT_P (step)) > + return false; > + > + auto mode = GET_MODE (series); > + auto elt_mode = as_a<scalar_int_mode> (GET_MODE_INNER (mode)); > + unsigned int vec_flags = aarch64_classify_vector_mode (mode); > + if (BYTES_BIG_ENDIAN && (vec_flags & VEC_ADVSIMD)) > + { > + /* On big-endian targets, architectural lane 0 holds the last element > + for Advanced SIMD and the first element for SVE; see the comment at > + the head of aarch64-sve.md for details. This means that, from an SVE > + point of view, an Advanced SIMD series goes from the last element to > + the first. */ > + auto i = GET_MODE_NUNITS (mode).to_constant () - 1; > + base = gen_int_mode (UINTVAL (base) + i * UINTVAL (step), elt_mode); > + step = gen_int_mode (-UINTVAL (step), elt_mode); > + } > + > + if (!aarch64_sve_index_immediate_p (base) > + || !aarch64_sve_index_immediate_p (step)) > + return false; > + > + /* If the mode spans multiple registers, check that each subseries is > + in range. */ > + unsigned int nvectors = aarch64_ldn_stn_vectors (mode); > + if (nvectors != 1) > + { > + unsigned int nunits; > + if (!GET_MODE_NUNITS (mode).is_constant (&nunits)) > + return false; > + nunits /= nvectors; > + for (unsigned int i = 1; i < nvectors; ++i) > + if (!IN_RANGE (INTVAL (base) + i * nunits * INTVAL (step), -16, 15)) > + return false; > + } > + > + *base_out = base; > + *step_out = step; > + return true; > +} > + > /* Return true if X is a valid immediate for the SVE ADD and SUB instructions > when applied to mode MODE. Negate X first if NEGATE_P is true. 
*/ > > @@ -23522,13 +23574,8 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info > *info, > n_elts = CONST_VECTOR_NPATTERNS (op); > else if (which == AARCH64_CHECK_MOV > && TARGET_SVE > - && const_vec_series_p (op, &base, &step)) > + && aarch64_sve_index_series_p (op, &base, &step)) > { > - gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); > - if (!aarch64_sve_index_immediate_p (base) > - || !aarch64_sve_index_immediate_p (step)) > - return false; > - > if (info) > { > /* Get the corresponding container mode. E.g. an INDEX on V2SI > diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/vec-series-1.c > b/gcc/testsuite/gcc.dg/rtl/aarch64/vec-series-1.c > new file mode 100644 > index 00000000000..6f795c68ba4 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/rtl/aarch64/vec-series-1.c > @@ -0,0 +1,35 @@ > +/* { dg-do compile { target aarch64*-*-* } } */ > +/* { dg-options "-O2 -msve-vector-bits=256 -mlittle-endian" } */ > + > +#include <arm_sve.h> > + > +#pragma GCC target "+sve" > + > +svint64x2_t __RTL (startwith ("vregs")) foo () > +{ > + (function "foo" > + (insn-chain > + (block 2 > + (edge-from entry (flags "FALLTHRU")) > + (cnote 1 [bb 2] NOTE_INSN_BASIC_BLOCK) > + (cnote 2 NOTE_INSN_FUNCTION_BEG) > + (insn 3 (set (reg:VNx4DI <0>) > + (const_vector:VNx4DI [(const_int 11) > + (const_int 12) > + (const_int 13) > + (const_int 14) > + (const_int 15) > + (const_int 16) > + (const_int 17) > + (const_int 18)]))) > + (insn 4 (set (reg:VNx4DI v0) (reg:VNx4DI <0>))) > + (insn 5 (use (reg:VNx4DI v0))) > + (edge-to exit (flags "FALLTHRU")) > + ) ;; block 2 > + ) ;; insn-chain > + (crtl (return_rtx (reg:VNx4DI v0))) > + ) ;; function > +} > + > +/* { dg-final { scan-assembler {\tindex\tz0\.d, #11, #1\n} } } */ > +/* { dg-final { scan-assembler {\tindex\tz1\.d, #15, #1\n} } } */ > diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/vec-series-2.c > b/gcc/testsuite/gcc.dg/rtl/aarch64/vec-series-2.c > new file mode 100644 > index 00000000000..17e46cbc03c > --- /dev/null > +++ 
b/gcc/testsuite/gcc.dg/rtl/aarch64/vec-series-2.c > @@ -0,0 +1,35 @@ > +/* { dg-do compile { target aarch64*-*-* } } */ > +/* { dg-options "-O2 -msve-vector-bits=256 -mlittle-endian" } */ > + > +#include <arm_sve.h> > + > +#pragma GCC target "+sve" > + > +svint64x2_t __RTL (startwith ("vregs")) foo () > +{ > + (function "foo" > + (insn-chain > + (block 2 > + (edge-from entry (flags "FALLTHRU")) > + (cnote 1 [bb 2] NOTE_INSN_BASIC_BLOCK) > + (cnote 2 NOTE_INSN_FUNCTION_BEG) > + (insn 3 (set (reg:VNx4DI <0>) > + (const_vector:VNx4DI [(const_int -16) > + (const_int -15) > + (const_int -14) > + (const_int -13) > + (const_int -12) > + (const_int -11) > + (const_int -10) > + (const_int -9)]))) > + (insn 4 (set (reg:VNx4DI v0) (reg:VNx4DI <0>))) > + (insn 5 (use (reg:VNx4DI v0))) > + (edge-to exit (flags "FALLTHRU")) > + ) ;; block 2 > + ) ;; insn-chain > + (crtl (return_rtx (reg:VNx4DI v0))) > + ) ;; function > +} > + > +/* { dg-final { scan-assembler {\tindex\tz0\.d, #-16, #1\n} } } */ > +/* { dg-final { scan-assembler {\tindex\tz1\.d, #-12, #1\n} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c > index 218a6601337..13ebb9fd6fe 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c > @@ -10,6 +10,6 @@ dupq (int x) > return svdupq_s32 (x, 1, 2, 3); > } > > -/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */ > +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1\n} } } */ > /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */ > /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c > index cbee6f27b62..13d27e2781d 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c > +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c > @@ -10,6 +10,6 @@ dupq (int x) > return svdupq_s32 (0, 1, x, 3); > } > > -/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */ > +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1\n} } } */ > /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */ > /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c > b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c > index 25910dbfa1f..5100a87c0d9 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -mlittle-endian" } */ > /* { dg-final { check-function-bodies "**" "" "" } } */ > > typedef char v16qi __attribute__ ((vector_size (16))); > @@ -8,7 +8,7 @@ typedef short v8hi __attribute__ ((vector_size (16))); > typedef short v4hi __attribute__ ((vector_size (8))); > typedef int v4si __attribute__ ((vector_size (16))); > typedef int v2si __attribute__ ((vector_size (8))); > -typedef long v2di __attribute__ ((vector_size (16))); > +typedef long long v2di __attribute__ ((vector_size (16))); > > /* > ** f_v16qi: > @@ -97,3 +97,113 @@ g_v4si (void) > { > return (v4si){ 3, -1, -5, -9 }; > } > + > +/* > +** g_min_1: > +** index z0\.s, #-16, #1 > +** ret > +*/ > +v4si > +g_min_1 (void) > +{ > + return (v4si){ -16, -15, -14, -13 }; > +} > + > +/* > +** g_min_min: > +** index z0\.s, #-16, #-16 > +** ret > +*/ > +v4si > +g_min_min (void) > +{ > + return (v4si){ -16, -32, -48, -64 }; > +} > + > +/* > +** g_min_max: > +** index z0\.s, #-16, #15 > +** ret > +*/ > +v4si > +g_min_max (void) > +{ > + return (v4si){ -16, -1, 14, 29 }; > +} > + > +/* > +** g_max_1: > +** index z0\.s, #15, #1 > +** ret > +*/ > +v4si > +g_max_1 (void) > +{ > + return (v4si){ 15, 16, 17, 18 }; > +} > + > +/* > +** 
g_max_min: > +** index z0\.s, #15, #-16 > +** ret > +*/ > +v4si > +g_max_min (void) > +{ > + return (v4si){ 15, -1, -17, -33 }; > +} > + > +/* > +** g_max_max: > +** index z0\.s, #15, #15 > +** ret > +*/ > +v4si > +g_max_max (void) > +{ > + return (v4si){ 15, 30, 45, 60 }; > +} > + > +/* > +** g_ob_1: > +** ((?!index).)* > +** ret > +*/ > +v4si > +g_ob_1 (void) > +{ > + return (v4si){ -17, -16, -15, -14 }; > +} > + > +/* > +** g_ob_2: > +** ((?!index).)* > +** ret > +*/ > +v4si > +g_ob_2 (void) > +{ > + return (v4si){ 16, 17, 18, 19 }; > +} > + > +/* > +** g_ob_3: > +** ((?!index).)* > +** ret > +*/ > +v4si > +g_ob_3 (void) > +{ > + return (v4si){ 0, -17, -34, -51 }; > +} > + > +/* > +** g_ob_4: > +** ((?!index).)* > +** ret > +*/ > +v4si > +g_ob_4 (void) > +{ > + return (v4si){ 0, 16, 32, 48 }; > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c > b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c > new file mode 100644 > index 00000000000..0681d959101 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c > @@ -0,0 +1,209 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mbig-endian" } */ > +/* { dg-final { check-function-bodies "**" "" "" } } */ > + > +typedef char v16qi __attribute__ ((vector_size (16))); > +typedef char v8qi __attribute__ ((vector_size (8))); > +typedef short v8hi __attribute__ ((vector_size (16))); > +typedef short v4hi __attribute__ ((vector_size (8))); > +typedef int v4si __attribute__ ((vector_size (16))); > +typedef int v2si __attribute__ ((vector_size (8))); > +typedef long long v2di __attribute__ ((vector_size (16))); > + > +/* > +** f_v16qi: > +** index z0\.b, #15, #-1 > +** ret > +*/ > +v16qi > +f_v16qi (void) > +{ > + return (v16qi){ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; > +} > + > +/* > +** f_v8qi: > +** index z0\.b, #7, #-1 > +** ret > +*/ > +v8qi > +f_v8qi (void) > +{ > + return (v8qi){ 0, 1, 2, 3, 4, 5, 6, 7 }; > +} > + > +/* > +** f_v8hi: > +** index z0\.h, #7, #-1 > 
+** ret > +*/ > +v8hi > +f_v8hi (void) > +{ > + return (v8hi){ 0, 1, 2, 3, 4, 5, 6, 7 }; > +} > + > +/* > +** f_v4hi: > +** index z0\.h, #3, #-1 > +** ret > +*/ > +v4hi > +f_v4hi (void) > +{ > + return (v4hi){ 0, 1, 2, 3 }; > +} > + > +/* > +** f_v4si: > +** index z0\.s, #3, #-1 > +** ret > +*/ > +v4si > +f_v4si (void) > +{ > + return (v4si){ 0, 1, 2, 3 }; > +} > + > +/* > +** f_v2si: > +** index z0\.s, #1, #-1 > +** ret > +*/ > +v2si > +f_v2si (void) > +{ > + return (v2si){ 0, 1 }; > +} > + > +/* > +** f_v2di: > +** index z0\.d, #1, #-1 > +** ret > +*/ > +v2di > +f_v2di (void) > +{ > + return (v2di){ 0, 1 }; > +} > + > +/* > +** g_v4si: > +** index z0\.s, #-9, #4 > +** ret > +*/ > +v4si > +g_v4si (void) > +{ > + return (v4si){ 3, -1, -5, -9 }; > +} > + > +/* > +** g_min_1: > +** index z0\.s, #-16, #1 > +** ret > +*/ > +v4si > +g_min_1 (void) > +{ > + return (v4si){ -13, -14, -15, -16 }; > +} > + > +/* > +** g_min_min: > +** index z0\.s, #-16, #-16 > +** ret > +*/ > +v4si > +g_min_min (void) > +{ > + return (v4si){ -64, -48, -32, -16 }; > +} > + > +/* > +** g_min_max: > +** index z0\.s, #-16, #15 > +** ret > +*/ > +v4si > +g_min_max (void) > +{ > + return (v4si){ 29, 14, -1, -16 }; > +} > + > +/* > +** g_max_1: > +** index z0\.s, #15, #1 > +** ret > +*/ > +v4si > +g_max_1 (void) > +{ > + return (v4si){ 18, 17, 16, 15 }; > +} > + > +/* > +** g_max_min: > +** index z0\.s, #15, #-16 > +** ret > +*/ > +v4si > +g_max_min (void) > +{ > + return (v4si){ -33, -17, -1, 15 }; > +} > + > +/* > +** g_max_max: > +** index z0\.s, #15, #15 > +** ret > +*/ > +v4si > +g_max_max (void) > +{ > + return (v4si){ 60, 45, 30, 15 }; > +} > + > +/* > +** g_ob_1: > +** ((?!index).)* > +** ret > +*/ > +v4si > +g_ob_1 (void) > +{ > + return (v4si){ -14, -15, -16, -17 }; > +} > + > +/* > +** g_ob_2: > +** ((?!index).)* > +** ret > +*/ > +v4si > +g_ob_2 (void) > +{ > + return (v4si){ 19, 18, 17, 16 }; > +} > + > +/* > +** g_ob_3: > +** ((?!index).)* > +** ret > +*/ > +v4si > +g_ob_3 (void) > +{ 
> + return (v4si){ -51, -34, -17, 0 }; > +} > + > +/* > +** g_ob_4: > +** ((?!index).)* > +** ret > +*/ > +v4si > +g_ob_4 (void) > +{ > + return (v4si){ 48, 32, 16, 0 }; > +} > -- > 2.43.0 >