Re: [PATCH] aarch64: Extend HVLA permutations to big-endian

Andrew Pinski Wed, 09 Jul 2025 11:37:32 -0700

On Wed, Jul 9, 2025 at 7:07 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> TARGET_VECTORIZE_VEC_PERM_CONST has code to match the SVE2.1
> "hybrid VLA" DUPQ, EXTQ, UZPQ{1,2}, and ZIPQ{1,2} instructions.
> This matching was conditional on !BYTES_BIG_ENDIAN.
>
> The ACLE code also lowered the associated SVE2.1 intrinsics into
> suitable VEC_PERM_EXPRs.  This lowering was not conditional on
> !BYTES_BIG_ENDIAN.
>
> The mismatch led to lots of ICEs in the ACLE tests on big-endian
> targets: we lowered to VEC_PERM_EXPRs that are not supported.
>
> I think the !BYTES_BIG_ENDIAN restriction was unnecessary.
> SVE maps the first memory element to the least significant end of
> the register for both endiannesses, so no endian correction or lane
> number adjustment is necessary.
>
> This is in some ways a bit counterintuitive.  ZIPQ1 is conceptually
> "apply Advanced SIMD ZIP1 to each 128-bit block" and endianness does
> matter when choosing between Advanced SIMD ZIP1 and ZIP2.  For example,
> the V4SI permute selector { 0, 4, 1, 5 } corresponds to ZIP1 for little-
> endian and ZIP2 for big-endian.  But the difference between the hybrid
> VLA and Advanced SIMD permute selectors is a consequence of the
> difference between the SVE and Advanced SIMD element orders.
>
> The same thing applies to ACLE intrinsics.  The current lowering of
> svzipq1 etc. is correct for both endiannesses.  If ACLE code does:
>
>   2x svld1_s32 + svzipq1_s32 + svst1_s32
>
> then the byte-for-byte result is the same for both endiannesses.
> On big-endian targets, this is different from using the Advanced SIMD
> sequence below for each 128-bit block:
>
>   2x LDR + ZIP1 + STR
>
> In contrast, the byte-for-byte result of:
>
>   2x svld1q_gather_s32 + svzipq1_s32 + svst1q_scatter_s32
>
> depends on endianness, since the quadword gathers and scatters use
> Advanced SIMD byte ordering for each 128-bit block.  This gather/scatter
> sequence behaves in the same way as the Advanced SIMD LDR+ZIP1+STR
> sequence for both endiannesses.
>
> Programmers writing ACLE code have to be aware of this difference
> if they want to support both endiannesses.
>
> The patch includes some new execution tests to verify the expansion
> of the VEC_PERM_EXPRs.
>
> Tested on aarch64-linux-gnu and aarch64_be-elf.  OK to install?


Ok.

>
> Richard
>
>
> gcc/
>         * doc/sourcebuild.texi (aarch64_sve2_hw, aarch64_sve2p1_hw): Document.
>         * config/aarch64/aarch64.cc (aarch64_evpc_hvla): Extend to
>         BYTES_BIG_ENDIAN.
>
> gcc/testsuite/
>         * lib/target-supports.exp (check_effective_target_aarch64_sve2p1_hw):
>         New proc.
>         * gcc.target/aarch64/sve2/dupq_1.c: Extend to big-endian.  Add
>         noipa attributes.
>         * gcc.target/aarch64/sve2/extq_1.c: Likewise.
>         * gcc.target/aarch64/sve2/uzpq_1.c: Likewise.
>         * gcc.target/aarch64/sve2/zipq_1.c: Likewise.
>         * gcc.target/aarch64/sve2/dupq_1_run.c: New test.
>         * gcc.target/aarch64/sve2/extq_1_run.c: Likewise.
>         * gcc.target/aarch64/sve2/uzpq_1_run.c: Likewise.
>         * gcc.target/aarch64/sve2/zipq_1_run.c: Likewise.
> ---
>  gcc/config/aarch64/aarch64.cc                 |  1 -
>  gcc/doc/sourcebuild.texi                      |  6 ++
>  .../gcc.target/aarch64/sve2/dupq_1.c          | 26 +++---
>  .../gcc.target/aarch64/sve2/dupq_1_run.c      | 87 +++++++++++++++++++
>  .../gcc.target/aarch64/sve2/extq_1.c          |  2 +-
>  .../gcc.target/aarch64/sve2/extq_1_run.c      | 73 ++++++++++++++++
>  .../gcc.target/aarch64/sve2/uzpq_1.c          |  2 +-
>  .../gcc.target/aarch64/sve2/uzpq_1_run.c      | 78 +++++++++++++++++
>  .../gcc.target/aarch64/sve2/zipq_1.c          |  2 +-
>  .../gcc.target/aarch64/sve2/zipq_1_run.c      | 78 +++++++++++++++++
>  gcc/testsuite/lib/target-supports.exp         | 17 ++++
>  11 files changed, 355 insertions(+), 17 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/dupq_1_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/extq_1_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/zipq_1_run.c
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 7960b639f90..ce25f4f6f9f 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -26752,7 +26752,6 @@ aarch64_evpc_hvla (struct expand_vec_perm_d *d)
>    machine_mode vmode = d->vmode;
>    if (!TARGET_SVE2p1
>        || !TARGET_NON_STREAMING
> -      || BYTES_BIG_ENDIAN
>        || d->vec_flags != VEC_SVE_DATA
>        || GET_MODE_UNIT_BITSIZE (vmode) > 64)
>      return false;
> diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
> index 6c5586e4b03..85fb810d96c 100644
> --- a/gcc/doc/sourcebuild.texi
> +++ b/gcc/doc/sourcebuild.texi
> @@ -2373,6 +2373,12 @@ whether it does so by default).
>  @itemx aarch64_sve1024_hw
>  @itemx aarch64_sve2048_hw
>  Like @code{aarch64_sve_hw}, but also test for an exact hardware vector length.
> +@item aarch64_sve2_hw
> +AArch64 target that is able to generate and execute SVE2 code (regardless of
> +whether it does so by default).
> +@item aarch64_sve2p1_hw
> +AArch64 target that is able to generate and execute SVE2.1 code (regardless of
> +whether it does so by default).
>
>  @item aarch64_fjcvtzs_hw
>  AArch64 target that is able to generate and execute armv8.3-a FJCVTZS
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1.c
> index 5472e30f812..9db60b1ea4f 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1.c
> @@ -1,5 +1,5 @@
>  /* { dg-options "-O2 -msve-vector-bits=256" } */
> -/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
>
>  #include <arm_sve.h>
>
> @@ -15,7 +15,7 @@ typedef svuint64_t fixed_uint64_t __attribute__((arm_sve_vector_bits(256)));
>  **     trn1    z0\.d, z0\.d, z0\.d
>  **     ret
>  */
> -fixed_uint64_t
> +[[gnu::noipa]] fixed_uint64_t
>  f1 (fixed_uint64_t z0)
>  {
>    return __builtin_shufflevector (z0, z0, 0, 0, 2, 2);
> @@ -26,7 +26,7 @@ f1 (fixed_uint64_t z0)
>  **     trn2    z0\.d, z0\.d, z0\.d
>  **     ret
>  */
> -fixed_uint64_t
> +[[gnu::noipa]] fixed_uint64_t
>  f2 (fixed_uint64_t z0)
>  {
>    return __builtin_shufflevector (z0, z0, 1, 1, 3, 3);
> @@ -37,7 +37,7 @@ f2 (fixed_uint64_t z0)
>  **     dupq    z0\.s, z0\.s\[0\]
>  **     ret
>  */
> -fixed_int32_t
> +[[gnu::noipa]] fixed_int32_t
>  f3 (fixed_int32_t z0)
>  {
>    return __builtin_shufflevector (z0, z0, 0, 0, 0, 0, 4, 4, 4, 4);
> @@ -48,7 +48,7 @@ f3 (fixed_int32_t z0)
>  **     dupq    z0\.s, z0\.s\[1\]
>  **     ret
>  */
> -fixed_int32_t
> +[[gnu::noipa]] fixed_int32_t
>  f4 (fixed_int32_t z0)
>  {
>    return __builtin_shufflevector (z0, z0, 1, 1, 1, 1, 5, 5, 5, 5);
> @@ -59,7 +59,7 @@ f4 (fixed_int32_t z0)
>  **     dupq    z0\.s, z0\.s\[2\]
>  **     ret
>  */
> -fixed_int32_t
> +[[gnu::noipa]] fixed_int32_t
>  f5 (fixed_int32_t z0)
>  {
>    return __builtin_shufflevector (z0, z0, 2, 2, 2, 2, 6, 6, 6, 6);
> @@ -70,7 +70,7 @@ f5 (fixed_int32_t z0)
>  **     dupq    z0\.s, z0\.s\[3\]
>  **     ret
>  */
> -fixed_int32_t
> +[[gnu::noipa]] fixed_int32_t
>  f6 (fixed_int32_t z0)
>  {
>    return __builtin_shufflevector (z0, z0, 3, 3, 3, 3, 7, 7, 7, 7);
> @@ -81,7 +81,7 @@ f6 (fixed_int32_t z0)
>  **     dupq    z0\.h, z0\.h\[0\]
>  **     ret
>  */
> -fixed_uint16_t
> +[[gnu::noipa]] fixed_uint16_t
>  f7 (fixed_uint16_t z0)
>  {
>    return __builtin_shufflevector (z0, z0,
> @@ -95,7 +95,7 @@ f7 (fixed_uint16_t z0)
>  **     dupq    z0\.h, z0\.h\[5\]
>  **     ret
>  */
> -fixed_uint16_t
> +[[gnu::noipa]] fixed_uint16_t
>  f8 (fixed_uint16_t z0)
>  {
>    return __builtin_shufflevector (z0, z0,
> @@ -108,7 +108,7 @@ f8 (fixed_uint16_t z0)
>  **     dupq    z0\.h, z0\.h\[7\]
>  **     ret
>  */
> -fixed_uint16_t
> +[[gnu::noipa]] fixed_uint16_t
>  f9 (fixed_uint16_t z0)
>  {
>    return __builtin_shufflevector (z0, z0,
> @@ -121,7 +121,7 @@ f9 (fixed_uint16_t z0)
>  **     dupq    z0\.b, z0\.b\[0\]
>  **     ret
>  */
> -fixed_uint8_t
> +[[gnu::noipa]] fixed_uint8_t
>  f10 (fixed_uint8_t z0)
>  {
>    return __builtin_shufflevector (z0, z0,
> @@ -136,7 +136,7 @@ f10 (fixed_uint8_t z0)
>  **     dupq    z0\.b, z0\.b\[13\]
>  **     ret
>  */
> -fixed_uint8_t
> +[[gnu::noipa]] fixed_uint8_t
>  f11 (fixed_uint8_t z0)
>  {
>    return __builtin_shufflevector (z0, z0,
> @@ -151,7 +151,7 @@ f11 (fixed_uint8_t z0)
>  **     dupq    z0\.b, z0\.b\[15\]
>  **     ret
>  */
> -fixed_uint8_t
> +[[gnu::noipa]] fixed_uint8_t
>  f12 (fixed_uint8_t z0)
>  {
>    return __builtin_shufflevector (z0, z0,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1_run.c
> new file mode 100644
> index 00000000000..fd25034c4b4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1_run.c
> @@ -0,0 +1,87 @@
> +/* { dg-do run { target { aarch64_sve256_hw && aarch64_sve2p1_hw } } } */
> +/* { dg-options "-O2 -msve-vector-bits=256" } */
> +
> +#include "dupq_1.c"
> +
> +#define TEST(A, B)                                                     \
> +  do {                                                                 \
> +    typeof(B) actual_ = (A);                                           \
> +    if (__builtin_memcmp (&actual_, &(B), sizeof (actual_)) != 0)      \
> +      __builtin_abort ();                                              \
> +  } while (0)
> +
> +int
> +main ()
> +{
> +  fixed_uint64_t a64 = { 0x1122, -1, 0x5566, -2 };
> +  fixed_int32_t a32 = { 0x1122, -0x3344, 0x5566, -0x7788,
> +                       0x99aa, -0xbbcc, 0xddee, -0xff00 };
> +  fixed_uint16_t a16 = { 0x9a12, 0xbc34, 0xde56, 0xf078,
> +                        0x00ff, 0x11ee, 0x22dd, 0x33cc,
> +                        0x44bb, 0x55aa, 0x6699, 0x7788,
> +                        0xfe01, 0xdc23, 0xba45, 0x9867 };
> +  fixed_uint8_t a8 = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70,
> +                      0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8,
> +                      0xfe, 0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f,
> +                      0x76, 0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07 };
> +
> +  fixed_uint64_t expected1 = { 0x1122, 0x1122, 0x5566, 0x5566 };
> +  TEST (f1 (a64), expected1);
> +
> +  fixed_uint64_t expected2 = { -1, -1, -2, -2 };
> +  TEST (f2 (a64), expected2);
> +
> +  fixed_int32_t expected3 = { 0x1122, 0x1122, 0x1122, 0x1122,
> +                              0x99aa, 0x99aa, 0x99aa, 0x99aa };
> +  TEST (f3 (a32), expected3);
> +
> +  fixed_int32_t expected4 = { -0x3344, -0x3344, -0x3344, -0x3344,
> +                             -0xbbcc, -0xbbcc, -0xbbcc, -0xbbcc };
> +  TEST (f4 (a32), expected4);
> +
> +  fixed_int32_t expected5 = { 0x5566, 0x5566, 0x5566, 0x5566,
> +                              0xddee, 0xddee, 0xddee, 0xddee };
> +  TEST (f5 (a32), expected5);
> +
> +  fixed_int32_t expected6 = { -0x7788, -0x7788, -0x7788, -0x7788,
> +                             -0xff00, -0xff00, -0xff00, -0xff00 };
> +  TEST (f6 (a32), expected6);
> +
> +  fixed_uint16_t expected7 = { 0x9a12, 0x9a12, 0x9a12, 0x9a12,
> +                              0x9a12, 0x9a12, 0x9a12, 0x9a12,
> +                              0x44bb, 0x44bb, 0x44bb, 0x44bb,
> +                              0x44bb, 0x44bb, 0x44bb, 0x44bb };
> +  TEST (f7 (a16), expected7);
> +
> +  fixed_uint16_t expected8 = { 0x11ee, 0x11ee, 0x11ee, 0x11ee,
> +                              0x11ee, 0x11ee, 0x11ee, 0x11ee,
> +                              0xdc23, 0xdc23, 0xdc23, 0xdc23,
> +                              0xdc23, 0xdc23, 0xdc23, 0xdc23 };
> +  TEST (f8 (a16), expected8);
> +
> +  fixed_uint16_t expected9 = { 0x33cc, 0x33cc, 0x33cc, 0x33cc,
> +                              0x33cc, 0x33cc, 0x33cc, 0x33cc,
> +                              0x9867, 0x9867, 0x9867, 0x9867,
> +                              0x9867, 0x9867, 0x9867, 0x9867 };
> +  TEST (f9 (a16), expected9);
> +
> +  fixed_uint8_t expected10 = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
> +                              0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
> +                              0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
> +                              0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe };
> +  TEST (f10 (a8), expected10);
> +
> +  fixed_uint8_t expected11 = { 0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde,
> +                              0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde,
> +                              0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21,
> +                              0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21 };
> +  TEST (f11 (a8), expected11);
> +
> +  fixed_uint8_t expected12 = { 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
> +                              0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
> +                              0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
> +                              0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 };
> +  TEST (f12 (a8), expected12);
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/extq_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/extq_1.c
> index 03c5fb143f7..b8ce89c9576 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/extq_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/extq_1.c
> @@ -1,5 +1,5 @@
>  /* { dg-options "-O2 -msve-vector-bits=256" } */
> -/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
>
>  #include <arm_sve.h>
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/extq_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve2/extq_1_run.c
> new file mode 100644
> index 00000000000..6b72c98a22c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/extq_1_run.c
> @@ -0,0 +1,73 @@
> +/* { dg-do run { target { aarch64_sve256_hw && aarch64_sve2p1_hw } } } */
> +/* { dg-options "-O2 -msve-vector-bits=256" } */
> +
> +#include "extq_1.c"
> +
> +#define TEST(A, B)                                                     \
> +  do {                                                                 \
> +    typeof(B) actual_ = (A);                                           \
> +    if (__builtin_memcmp (&actual_, &(B), sizeof (actual_)) != 0)      \
> +      __builtin_abort ();                                              \
> +  } while (0)
> +
> +int
> +main ()
> +{
> +  fixed_float64_t a64 = { 1.5, 3.75, -5.25, 9 };
> +  fixed_float64_t b64 = { -2, 4.125, -6.375, 11.5 };
> +  fixed_float64_t expected1 = { 3.75, -2, 9, -6.375 };
> +  TEST (f1 (a64, b64), expected1);
> +
> +  fixed_uint32_t a32 = { 0x1122, -0x3344, 0x5566, -0x7788,
> +                        0x99aa, -0xbbcc, 0xddee, -0xff00 };
> +  fixed_uint32_t b32 = { 1 << 20, 1 << 21, 1 << 22, 1 << 23,
> +                        5 << 6, 5 << 7, 5 << 8, 5 << 9 };
> +  fixed_uint32_t expected2 = { -0x3344, 0x5566, -0x7788, 1 << 20,
> +                              -0xbbcc, 0xddee, -0xff00, 5 << 6 };
> +  fixed_uint32_t expected3 = { -0x7788, 1 << 20, 1 << 21, 1 << 22,
> +                              -0xff00, 5 << 6, 5 << 7, 5 << 8 };
> +  TEST (f2 (a32, b32), expected2);
> +  TEST (f3 (a32, b32), expected3);
> +
> +  fixed_float16_t a16 = { 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25,
> +                         2.5, 2.75, 3, 3.25, 3.5, 3.75, 4, 4.25 };
> +  fixed_float16_t b16 = { -0.5, -0.75, -1, -1.25, -1.5, -1.75, -2, -2.25,
> +                         -2.5, -2.75, -3, -3.25, -3.5, -3.75, -4, -4.25 };
> +  fixed_float16_t expected4 = { 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, -0.5,
> +                               2.75, 3, 3.25, 3.5, 3.75, 4, 4.25, -2.5 };
> +  fixed_float16_t expected5 = { 1.75, 2, 2.25, -0.5, -0.75, -1, -1.25, -1.5,
> +                               3.75, 4, 4.25, -2.5, -2.75, -3, -3.25, -3.5 };
> +  fixed_float16_t expected6 = { 2.25, -0.5, -0.75, -1,
> +                               -1.25, -1.5, -1.75, -2,
> +                               4.25, -2.5, -2.75, -3,
> +                               -3.25, -3.5, -3.75, -4 };
> +  TEST (f4 (a16, b16), expected4);
> +  TEST (f5 (a16, b16), expected5);
> +  TEST (f6 (a16, b16), expected6);
> +
> +  fixed_int8_t a8 = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70,
> +                     0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8,
> +                     0xfe, 0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f,
> +                     0x76, 0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07 };
> +  fixed_int8_t b8 = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
> +                     0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00,
> +                     0x13, 0x24, 0x35, 0x46, 0x57, 0x68, 0x79, 0x8a,
> +                     0x9b, 0xac, 0xbd, 0xce, 0xdf, 0xe0, 0xf1, 0x02 };
> +  fixed_int8_t expected7 = { 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70, 0x89,
> +                            0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8, 0x11,
> +                            0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f, 0x76,
> +                            0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07, 0x13 };
> +  fixed_int8_t expected8 = { 0xbc, 0xcd, 0xde, 0xef, 0xf8, 0x11, 0x22, 0x33,
> +                            0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb,
> +                            0x43, 0x32, 0x21, 0x10, 0x07, 0x13, 0x24, 0x35,
> +                            0x46, 0x57, 0x68, 0x79, 0x8a, 0x9b, 0xac, 0xbd };
> +  fixed_int8_t expected9 = { 0xf8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
> +                            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
> +                            0x07, 0x13, 0x24, 0x35, 0x46, 0x57, 0x68, 0x79,
> +                            0x8a, 0x9b, 0xac, 0xbd, 0xce, 0xdf, 0xe0, 0xf1 };
> +  TEST (f7 (a8, b8), expected7);
> +  TEST (f8 (a8, b8), expected8);
> +  TEST (f9 (a8, b8), expected9);
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1.c
> index f923e9447ec..47a663e94c2 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1.c
> @@ -1,5 +1,5 @@
>  /* { dg-options "-O2 -msve-vector-bits=256" } */
> -/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
>
>  #include <arm_sve.h>
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1_run.c
> new file mode 100644
> index 00000000000..9044cae659b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1_run.c
> @@ -0,0 +1,78 @@
> +/* { dg-do run { target { aarch64_sve256_hw && aarch64_sve2p1_hw } } } */
> +/* { dg-options "-O2 -msve-vector-bits=256" } */
> +
> +#include "uzpq_1.c"
> +
> +typedef svuint16_t fixed_uint16_t __attribute__((arm_sve_vector_bits(256)));
> +
> +#define TEST(A, B)                                                     \
> +  do {                                                                 \
> +    typeof(A) actual_ = (A);                                           \
> +    if (__builtin_memcmp (&actual_, &(B), sizeof (actual_)) != 0)      \
> +      __builtin_abort ();                                              \
> +  } while (0)
> +
> +int
> +main ()
> +{
> +  fixed_int64_t a64 = { 0x1122LL << 31, -1LL << 47, 0x5566 << 15, -2 };
> +  fixed_int64_t b64 = { 42, -0x3344LL << 19, 303, -0x7788LL << 27 };
> +  fixed_int64_t expected1 = { 0x1122LL << 31, 42,
> +                             0x5566 << 15, 303 };
> +  fixed_int64_t expected2 = { -1LL << 47, -0x3344LL << 19,
> +                             -2, -0x7788LL << 27 };
> +  TEST (f1 (a64, b64), expected1);
> +  TEST (f2 (a64, b64), expected2);
> +
> +  fixed_float32_t a32 = { 0.5, 0.75, 1, 1.25, 2.5, 2.75, 3, 3.25 };
> +  fixed_float32_t b32 = { -0.5, -0.75, -1, -1.25, -2.5, -2.75, -3, -3.25 };
> +  fixed_float32_t expected3 = { 0.5, 1, -0.5, -1,
> +                               2.5, 3, -2.5, -3 };
> +  fixed_float32_t expected4 = { 0.75, 1.25, -0.75, -1.25,
> +                               2.75, 3.25, -2.75, -3.25 };
> +  TEST (f3 (a32, b32), expected3);
> +  TEST (f4 (a32, b32), expected4);
> +
> +  fixed_uint16_t a16_i = { 0x9a12, 0xbc34, 0xde56, 0xf078,
> +                          0x00ff, 0x11ee, 0x22dd, 0x33cc,
> +                          0x44bb, 0x55aa, 0x6699, 0x7788,
> +                          0xfe01, 0xdc23, 0xba45, 0x9867 };
> +  fixed_uint16_t b16_i = { 0x1010, 0x2020, 0x3030, 0x4040,
> +                          0x5050, 0x6060, 0x7070, 0x8080,
> +                          0x9090, 0xa0a0, 0xb0b0, 0xc0c0,
> +                          0xd0d0, 0xe0e0, 0xf0f0, 0x0f0f };
> +  fixed_uint16_t expected5 = { 0x9a12, 0xde56, 0x00ff, 0x22dd,
> +                              0x1010, 0x3030, 0x5050, 0x7070,
> +                              0x44bb, 0x6699, 0xfe01, 0xba45,
> +                              0x9090, 0xb0b0, 0xd0d0, 0xf0f0 };
> +  fixed_uint16_t expected6 = { 0xbc34, 0xf078, 0x11ee, 0x33cc,
> +                              0x2020, 0x4040, 0x6060, 0x8080,
> +                              0x55aa, 0x7788, 0xdc23, 0x9867,
> +                              0xa0a0, 0xc0c0, 0xe0e0, 0x0f0f };
> +  fixed_bfloat16_t a16, b16;
> +  __builtin_memcpy (&a16, &a16_i, sizeof (a16));
> +  __builtin_memcpy (&b16, &b16_i, sizeof (b16));
> +  TEST (f5 (a16, b16), expected5);
> +  TEST (f6 (a16, b16), expected6);
> +
> +  fixed_uint8_t a8 = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70,
> +                      0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8,
> +                      0xfe, 0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f,
> +                      0x76, 0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07 };
> +  fixed_uint8_t b8 = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
> +                      0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00,
> +                      0x13, 0x24, 0x35, 0x46, 0x57, 0x68, 0x79, 0x8a,
> +                      0x9b, 0xac, 0xbd, 0xce, 0xdf, 0xe0, 0xf1, 0x02 };
> +  fixed_uint8_t expected7 = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
> +                             0x11, 0x33, 0x55, 0x77, 0x99, 0xbb, 0xdd, 0xff,
> +                             0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
> +                             0x13, 0x35, 0x57, 0x79, 0x9b, 0xbd, 0xdf, 0xf1 };
> +  fixed_uint8_t expected8 = { 0x12, 0x34, 0x56, 0x70, 0x9a, 0xbc, 0xde, 0xf8,
> +                             0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x00,
> +                             0xed, 0xcb, 0xa9, 0x8f, 0x65, 0x43, 0x21, 0x07,
> +                             0x24, 0x46, 0x68, 0x8a, 0xac, 0xce, 0xe0, 0x02 };
> +  TEST (f7 (a8, b8), expected7);
> +  TEST (f8 (a8, b8), expected8);
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1.c
> index fa420a959c7..482f2e8b829 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1.c
> @@ -1,5 +1,5 @@
>  /* { dg-options "-O2 -msve-vector-bits=256" } */
> -/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
>
>  #include <arm_sve.h>
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1_run.c
> new file mode 100644
> index 00000000000..211f9d945ed
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1_run.c
> @@ -0,0 +1,78 @@
> +/* { dg-do run { target { aarch64_sve256_hw && aarch64_sve2p1_hw } } } */
> +/* { dg-options "-O2 -msve-vector-bits=256" } */
> +
> +#include "zipq_1.c"
> +
> +typedef svuint16_t fixed_uint16_t __attribute__((arm_sve_vector_bits(256)));
> +
> +#define TEST(A, B)                                                     \
> +  do {                                                                 \
> +    typeof(A) actual_ = (A);                                           \
> +    if (__builtin_memcmp (&actual_, &(B), sizeof (actual_)) != 0)      \
> +      __builtin_abort ();                                              \
> +  } while (0)
> +
> +int
> +main ()
> +{
> +  fixed_int64_t a64 = { 0x1122LL << 31, -1LL << 47, 0x5566 << 15, -2 };
> +  fixed_int64_t b64 = { 42, -0x3344LL << 19, 303, -0x7788LL << 27 };
> +  fixed_int64_t expected1 = { 0x1122LL << 31, 42,
> +                             0x5566 << 15, 303 };
> +  fixed_int64_t expected2 = { -1LL << 47, -0x3344LL << 19,
> +                             -2, -0x7788LL << 27 };
> +  TEST (f1 (a64, b64), expected1);
> +  TEST (f2 (a64, b64), expected2);
> +
> +  fixed_float32_t a32 = { 0.5, 0.75, 1, 1.25, 2.5, 2.75, 3, 3.25 };
> +  fixed_float32_t b32 = { -0.5, -0.75, -1, -1.25, -2.5, -2.75, -3, -3.25 };
> +  fixed_float32_t expected3 = { 0.5, -0.5, 0.75, -0.75,
> +                               2.5, -2.5, 2.75, -2.75 };
> +  fixed_float32_t expected4 = { 1, -1, 1.25, -1.25,
> +                               3, -3, 3.25, -3.25 };
> +  TEST (f3 (a32, b32), expected3);
> +  TEST (f4 (a32, b32), expected4);
> +
> +  fixed_uint16_t a16_i = { 0x9a12, 0xbc34, 0xde56, 0xf078,
> +                          0x00ff, 0x11ee, 0x22dd, 0x33cc,
> +                          0x44bb, 0x55aa, 0x6699, 0x7788,
> +                          0xfe01, 0xdc23, 0xba45, 0x9867 };
> +  fixed_uint16_t b16_i = { 0x1010, 0x2020, 0x3030, 0x4040,
> +                          0x5050, 0x6060, 0x7070, 0x8080,
> +                          0x9090, 0xa0a0, 0xb0b0, 0xc0c0,
> +                          0xd0d0, 0xe0e0, 0xf0f0, 0x0f0f };
> +  fixed_uint16_t expected5 = { 0x9a12, 0x1010, 0xbc34, 0x2020,
> +                              0xde56, 0x3030, 0xf078, 0x4040,
> +                              0x44bb, 0x9090, 0x55aa, 0xa0a0,
> +                              0x6699, 0xb0b0, 0x7788, 0xc0c0 };
> +  fixed_uint16_t expected6 = { 0x00ff, 0x5050, 0x11ee, 0x6060,
> +                              0x22dd, 0x7070, 0x33cc, 0x8080,
> +                              0xfe01, 0xd0d0, 0xdc23, 0xe0e0,
> +                              0xba45, 0xf0f0, 0x9867, 0x0f0f };
> +  fixed_bfloat16_t a16, b16;
> +  __builtin_memcpy (&a16, &a16_i, sizeof (a16));
> +  __builtin_memcpy (&b16, &b16_i, sizeof (b16));
> +  TEST (f5 (a16, b16), expected5);
> +  TEST (f6 (a16, b16), expected6);
> +
> +  fixed_uint8_t a8 = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70,
> +                      0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8,
> +                      0xfe, 0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f,
> +                      0x76, 0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07 };
> +  fixed_uint8_t b8 = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
> +                      0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00,
> +                      0x13, 0x24, 0x35, 0x46, 0x57, 0x68, 0x79, 0x8a,
> +                      0x9b, 0xac, 0xbd, 0xce, 0xdf, 0xe0, 0xf1, 0x02 };
> +  fixed_uint8_t expected7 = { 0x01, 0x11, 0x12, 0x22, 0x23, 0x33, 0x34, 0x44,
> +                             0x45, 0x55, 0x56, 0x66, 0x67, 0x77, 0x70, 0x88,
> +                             0xfe, 0x13, 0xed, 0x24, 0xdc, 0x35, 0xcb, 0x46,
> +                             0xba, 0x57, 0xa9, 0x68, 0x98, 0x79, 0x8f, 0x8a };
> +  fixed_uint8_t expected8 = { 0x89, 0x99, 0x9a, 0xaa, 0xab, 0xbb, 0xbc, 0xcc,
> +                             0xcd, 0xdd, 0xde, 0xee, 0xef, 0xff, 0xf8, 0x00,
> +                             0x76, 0x9b, 0x65, 0xac, 0x54, 0xbd, 0x43, 0xce,
> +                             0x32, 0xdf, 0x21, 0xe0, 0x10, 0xf1, 0x07, 0x02 };
> +  TEST (f7 (a8, b8), expected7);
> +  TEST (f8 (a8, b8), expected8);
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index 956bc0bc7ca..9ab46a0eab4 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -6491,6 +6491,23 @@ proc check_effective_target_aarch64_sve2_hw { } {
>      }]
>  }
>
> +# Return true if this is an AArch64 target that can run SVE2.1 code.
> +
> +proc check_effective_target_aarch64_sve2p1_hw { } {
> +    if { ![istarget aarch64*-*-*] } {
> +       return 0
> +    }
> +    return [check_runtime aarch64_sve2p1_hw_available {
> +       #pragma GCC target "+sve2p1"
> +       int
> +       main (void)
> +       {
> +         asm volatile ("dupq z0.b, z0.b[0]");
> +         return 0;
> +       }
> +    }]
> +}
> +
>  # Return true if this is an AArch64 target that can run SVE code and
>  # if its SVE vectors have exactly BITS bits.
>
> --
> 2.43.0
>

Re: [PATCH] aarch64: Extend HVLA permutations to big-endian

Reply via email to