https://gcc.gnu.org/g:3b870131487d786a74f27a89d0415c8207770f14

commit r16-2164-g3b870131487d786a74f27a89d0415c8207770f14
Author: Richard Sandiford <richard.sandif...@arm.com>
Date:   Thu Jul 10 10:57:28 2025 +0100

    aarch64: Extend HVLA permutations to big-endian
    
    TARGET_VECTORIZE_VEC_PERM_CONST has code to match the SVE2.1
    "hybrid VLA" DUPQ, EXTQ, UZPQ{1,2}, and ZIPQ{1,2} instructions.
    This matching was conditional on !BYTES_BIG_ENDIAN.
    
    The ACLE code also lowered the associated SVE2.1 intrinsics into
    suitable VEC_PERM_EXPRs.  This lowering was not conditional on
    !BYTES_BIG_ENDIAN.
    
    The mismatch led to lots of ICEs in the ACLE tests on big-endian
    targets: we lowered to VEC_PERM_EXPRs that are not supported.
    
    I think the !BYTES_BIG_ENDIAN restriction was unnecessary.
    SVE maps the first memory element to the least significant end of
    the register for both endiannesses, so no endian correction or lane
    number adjustment is necessary.
    
    This is in some ways a bit counterintuitive.  ZIPQ1 is conceptually
    "apply Advanced SIMD ZIP1 to each 128-bit block" and endianness does
    matter when choosing between Advanced SIMD ZIP1 and ZIP2.  For example,
    the V4SI permute selector { 0, 4, 1, 5 } corresponds to ZIP1 for little-
    endian and ZIP2 for big-endian.  But the difference between the hybrid
    VLA and Advanced SIMD permute selectors is a consequence of the
    difference between the SVE and Advanced SIMD element orders.
    
    The same thing applies to ACLE intrinsics.  The current lowering of
    svzipq1 etc. is correct for both endiannesses.  If ACLE code does:
    
      2x svld1_s32 + svzipq1_s32 + svst1_s32
    
    then the byte-for-byte result is the same for both endiannesses.
    On big-endian targets, this is different from using the Advanced SIMD
    sequence below for each 128-bit block:
    
      2x LDR + ZIP1 + STR
    
    In contrast, the byte-for-byte result of:
    
      2x svld1q_gather_s32 + svzipq1_s32 + svst1q_scatter_s32
    
    depends on endianness, since the quadword gathers and scatters use
    Advanced SIMD byte ordering for each 128-bit block.  This gather/scatter
    sequence behaves in the same way as the Advanced SIMD LDR+ZIP1+STR
    sequence for both endiannesses.
    
    Programmers writing ACLE code have to be aware of this difference
    if they want to support both endiannesses.
    
    The patch includes some new execution tests to verify the expansion
    of the VEC_PERM_EXPRs.
    
    gcc/
            * doc/sourcebuild.texi (aarch64_sve2_hw, aarch64_sve2p1_hw): 
Document.
            * config/aarch64/aarch64.cc (aarch64_evpc_hvla): Extend to
            BYTES_BIG_ENDIAN.
    
    gcc/testsuite/
            * lib/target-supports.exp 
(check_effective_target_aarch64_sve2p1_hw):
            New proc.
            * gcc.target/aarch64/sve2/dupq_1.c: Extend to big-endian.  Add
            noipa attributes.
            * gcc.target/aarch64/sve2/extq_1.c: Likewise.
            * gcc.target/aarch64/sve2/uzpq_1.c: Likewise.
            * gcc.target/aarch64/sve2/zipq_1.c: Likewise.
            * gcc.target/aarch64/sve2/dupq_1_run.c: New test.
            * gcc.target/aarch64/sve2/extq_1_run.c: Likewise.
            * gcc.target/aarch64/sve2/uzpq_1_run.c: Likewise.
            * gcc.target/aarch64/sve2/zipq_1_run.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64.cc                      |  1 -
 gcc/doc/sourcebuild.texi                           |  6 ++
 gcc/testsuite/gcc.target/aarch64/sve2/dupq_1.c     | 26 +++----
 gcc/testsuite/gcc.target/aarch64/sve2/dupq_1_run.c | 87 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/sve2/extq_1.c     | 20 ++---
 gcc/testsuite/gcc.target/aarch64/sve2/extq_1_run.c | 73 ++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1.c     | 18 ++---
 gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1_run.c | 78 +++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/sve2/zipq_1.c     | 18 ++---
 gcc/testsuite/gcc.target/aarch64/sve2/zipq_1_run.c | 78 +++++++++++++++++++
 gcc/testsuite/lib/target-supports.exp              | 17 +++++
 11 files changed, 380 insertions(+), 42 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 477cbece6c98..27c315fc35e8 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26801,7 +26801,6 @@ aarch64_evpc_hvla (struct expand_vec_perm_d *d)
   machine_mode vmode = d->vmode;
   if (!TARGET_SVE2p1
       || !TARGET_NON_STREAMING
-      || BYTES_BIG_ENDIAN
       || d->vec_flags != VEC_SVE_DATA
       || GET_MODE_UNIT_BITSIZE (vmode) > 64)
     return false;
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 6c5586e4b034..85fb810d96c5 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2373,6 +2373,12 @@ whether it does so by default).
 @itemx aarch64_sve1024_hw
 @itemx aarch64_sve2048_hw
 Like @code{aarch64_sve_hw}, but also test for an exact hardware vector length.
+@item aarch64_sve2_hw
+AArch64 target that is able to generate and execute SVE2 code (regardless of
+whether it does so by default).
+@item aarch64_sve2p1_hw
+AArch64 target that is able to generate and execute SVE2.1 code (regardless of
+whether it does so by default).
 
 @item aarch64_fjcvtzs_hw
 AArch64 target that is able to generate and execute armv8.3-a FJCVTZS
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1.c
index 5472e30f812a..9db60b1ea4f2 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1.c
@@ -1,5 +1,5 @@
 /* { dg-options "-O2 -msve-vector-bits=256" } */
-/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
 
 #include <arm_sve.h>
 
@@ -15,7 +15,7 @@ typedef svuint64_t fixed_uint64_t 
__attribute__((arm_sve_vector_bits(256)));
 **     trn1    z0\.d, z0\.d, z0\.d
 **     ret
 */
-fixed_uint64_t
+[[gnu::noipa]] fixed_uint64_t
 f1 (fixed_uint64_t z0)
 {
   return __builtin_shufflevector (z0, z0, 0, 0, 2, 2);
@@ -26,7 +26,7 @@ f1 (fixed_uint64_t z0)
 **     trn2    z0\.d, z0\.d, z0\.d
 **     ret
 */
-fixed_uint64_t
+[[gnu::noipa]] fixed_uint64_t
 f2 (fixed_uint64_t z0)
 {
   return __builtin_shufflevector (z0, z0, 1, 1, 3, 3);
@@ -37,7 +37,7 @@ f2 (fixed_uint64_t z0)
 **     dupq    z0\.s, z0\.s\[0\]
 **     ret
 */
-fixed_int32_t
+[[gnu::noipa]] fixed_int32_t
 f3 (fixed_int32_t z0)
 {
   return __builtin_shufflevector (z0, z0, 0, 0, 0, 0, 4, 4, 4, 4);
@@ -48,7 +48,7 @@ f3 (fixed_int32_t z0)
 **     dupq    z0\.s, z0\.s\[1\]
 **     ret
 */
-fixed_int32_t
+[[gnu::noipa]] fixed_int32_t
 f4 (fixed_int32_t z0)
 {
   return __builtin_shufflevector (z0, z0, 1, 1, 1, 1, 5, 5, 5, 5);
@@ -59,7 +59,7 @@ f4 (fixed_int32_t z0)
 **     dupq    z0\.s, z0\.s\[2\]
 **     ret
 */
-fixed_int32_t
+[[gnu::noipa]] fixed_int32_t
 f5 (fixed_int32_t z0)
 {
   return __builtin_shufflevector (z0, z0, 2, 2, 2, 2, 6, 6, 6, 6);
@@ -70,7 +70,7 @@ f5 (fixed_int32_t z0)
 **     dupq    z0\.s, z0\.s\[3\]
 **     ret
 */
-fixed_int32_t
+[[gnu::noipa]] fixed_int32_t
 f6 (fixed_int32_t z0)
 {
   return __builtin_shufflevector (z0, z0, 3, 3, 3, 3, 7, 7, 7, 7);
@@ -81,7 +81,7 @@ f6 (fixed_int32_t z0)
 **     dupq    z0\.h, z0\.h\[0\]
 **     ret
 */
-fixed_uint16_t
+[[gnu::noipa]] fixed_uint16_t
 f7 (fixed_uint16_t z0)
 {
   return __builtin_shufflevector (z0, z0,
@@ -95,7 +95,7 @@ f7 (fixed_uint16_t z0)
 **     dupq    z0\.h, z0\.h\[5\]
 **     ret
 */
-fixed_uint16_t
+[[gnu::noipa]] fixed_uint16_t
 f8 (fixed_uint16_t z0)
 {
   return __builtin_shufflevector (z0, z0,
@@ -108,7 +108,7 @@ f8 (fixed_uint16_t z0)
 **     dupq    z0\.h, z0\.h\[7\]
 **     ret
 */
-fixed_uint16_t
+[[gnu::noipa]] fixed_uint16_t
 f9 (fixed_uint16_t z0)
 {
   return __builtin_shufflevector (z0, z0,
@@ -121,7 +121,7 @@ f9 (fixed_uint16_t z0)
 **     dupq    z0\.b, z0\.b\[0\]
 **     ret
 */
-fixed_uint8_t
+[[gnu::noipa]] fixed_uint8_t
 f10 (fixed_uint8_t z0)
 {
   return __builtin_shufflevector (z0, z0,
@@ -136,7 +136,7 @@ f10 (fixed_uint8_t z0)
 **     dupq    z0\.b, z0\.b\[13\]
 **     ret
 */
-fixed_uint8_t
+[[gnu::noipa]] fixed_uint8_t
 f11 (fixed_uint8_t z0)
 {
   return __builtin_shufflevector (z0, z0,
@@ -151,7 +151,7 @@ f11 (fixed_uint8_t z0)
 **     dupq    z0\.b, z0\.b\[15\]
 **     ret
 */
-fixed_uint8_t
+[[gnu::noipa]] fixed_uint8_t
 f12 (fixed_uint8_t z0)
 {
   return __builtin_shufflevector (z0, z0,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1_run.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1_run.c
new file mode 100644
index 000000000000..fd25034c4b46
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/dupq_1_run.c
@@ -0,0 +1,87 @@
+/* { dg-do run { target { aarch64_sve256_hw && aarch64_sve2p1_hw } } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "dupq_1.c"
+
+#define TEST(A, B)                                                     \
+  do {                                                                 \
+    typeof(B) actual_ = (A);                                           \
+    if (__builtin_memcmp (&actual_, &(B), sizeof (actual_)) != 0)      \
+      __builtin_abort ();                                              \
+  } while (0)
+
+int
+main ()
+{
+  fixed_uint64_t a64 = { 0x1122, -1, 0x5566, -2 };
+  fixed_int32_t a32 = { 0x1122, -0x3344, 0x5566, -0x7788,
+                       0x99aa, -0xbbcc, 0xddee, -0xff00 };
+  fixed_uint16_t a16 = { 0x9a12, 0xbc34, 0xde56, 0xf078,
+                        0x00ff, 0x11ee, 0x22dd, 0x33cc,
+                        0x44bb, 0x55aa, 0x6699, 0x7788,
+                        0xfe01, 0xdc23, 0xba45, 0x9867 };
+  fixed_uint8_t a8 = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70,
+                      0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8,
+                      0xfe, 0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f,
+                      0x76, 0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07 };
+
+  fixed_uint64_t expected1 = { 0x1122, 0x1122, 0x5566, 0x5566 };
+  TEST (f1 (a64), expected1);
+
+  fixed_uint64_t expected2 = { -1, -1, -2, -2 };
+  TEST (f2 (a64), expected2);
+
+  fixed_int32_t expected3 = { 0x1122, 0x1122, 0x1122, 0x1122,
+                              0x99aa, 0x99aa, 0x99aa, 0x99aa };
+  TEST (f3 (a32), expected3);
+
+  fixed_int32_t expected4 = { -0x3344, -0x3344, -0x3344, -0x3344,
+                             -0xbbcc, -0xbbcc, -0xbbcc, -0xbbcc };
+  TEST (f4 (a32), expected4);
+
+  fixed_int32_t expected5 = { 0x5566, 0x5566, 0x5566, 0x5566,
+                              0xddee, 0xddee, 0xddee, 0xddee };
+  TEST (f5 (a32), expected5);
+
+  fixed_int32_t expected6 = { -0x7788, -0x7788, -0x7788, -0x7788,
+                             -0xff00, -0xff00, -0xff00, -0xff00 };
+  TEST (f6 (a32), expected6);
+
+  fixed_uint16_t expected7 = { 0x9a12, 0x9a12, 0x9a12, 0x9a12,
+                              0x9a12, 0x9a12, 0x9a12, 0x9a12,
+                              0x44bb, 0x44bb, 0x44bb, 0x44bb,
+                              0x44bb, 0x44bb, 0x44bb, 0x44bb };
+  TEST (f7 (a16), expected7);
+
+  fixed_uint16_t expected8 = { 0x11ee, 0x11ee, 0x11ee, 0x11ee,
+                              0x11ee, 0x11ee, 0x11ee, 0x11ee,
+                              0xdc23, 0xdc23, 0xdc23, 0xdc23,
+                              0xdc23, 0xdc23, 0xdc23, 0xdc23 };
+  TEST (f8 (a16), expected8);
+
+  fixed_uint16_t expected9 = { 0x33cc, 0x33cc, 0x33cc, 0x33cc,
+                              0x33cc, 0x33cc, 0x33cc, 0x33cc,
+                              0x9867, 0x9867, 0x9867, 0x9867,
+                              0x9867, 0x9867, 0x9867, 0x9867 };
+  TEST (f9 (a16), expected9);
+
+  fixed_uint8_t expected10 = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                              0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                              0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+                              0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe };
+  TEST (f10 (a8), expected10);
+
+  fixed_uint8_t expected11 = { 0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde,
+                              0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde, 0xde,
+                              0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21,
+                              0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21 };
+  TEST (f11 (a8), expected11);
+
+  fixed_uint8_t expected12 = { 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
+                              0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
+                              0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+                              0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 };
+  TEST (f12 (a8), expected12);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/extq_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/extq_1.c
index 03c5fb143f7e..be5ae71de83c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/extq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/extq_1.c
@@ -1,5 +1,5 @@
 /* { dg-options "-O2 -msve-vector-bits=256" } */
-/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
 
 #include <arm_sve.h>
 
@@ -15,7 +15,7 @@ typedef svfloat64_t fixed_float64_t 
__attribute__((arm_sve_vector_bits(256)));
 **     extq    z0\.b, z0\.b, z1\.b, #8
 **     ret
 */
-fixed_float64_t
+[[gnu::noipa]] fixed_float64_t
 f1 (fixed_float64_t z0, fixed_float64_t z1)
 {
   return __builtin_shufflevector (z0, z1, 1, 4, 3, 6);
@@ -26,7 +26,7 @@ f1 (fixed_float64_t z0, fixed_float64_t z1)
 **     extq    z0\.b, z0\.b, z1\.b, #4
 **     ret
 */
-fixed_uint32_t
+[[gnu::noipa]] fixed_uint32_t
 f2 (fixed_uint32_t z0, fixed_uint32_t z1)
 {
   return __builtin_shufflevector (z0, z1, 1, 2, 3, 8, 5, 6, 7, 12);
@@ -37,7 +37,7 @@ f2 (fixed_uint32_t z0, fixed_uint32_t z1)
 **     extq    z0\.b, z0\.b, z1\.b, #12
 **     ret
 */
-fixed_uint32_t
+[[gnu::noipa]] fixed_uint32_t
 f3 (fixed_uint32_t z0, fixed_uint32_t z1)
 {
   return __builtin_shufflevector (z0, z1, 3, 8, 9, 10, 7, 12, 13, 14);
@@ -48,7 +48,7 @@ f3 (fixed_uint32_t z0, fixed_uint32_t z1)
 **     extq    z0\.b, z0\.b, z1\.b, #2
 **     ret
 */
-fixed_float16_t
+[[gnu::noipa]] fixed_float16_t
 f4 (fixed_float16_t z0, fixed_float16_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -61,7 +61,7 @@ f4 (fixed_float16_t z0, fixed_float16_t z1)
 **     extq    z0\.b, z0\.b, z1\.b, #10
 **     ret
 */
-fixed_float16_t
+[[gnu::noipa]] fixed_float16_t
 f5 (fixed_float16_t z0, fixed_float16_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -74,7 +74,7 @@ f5 (fixed_float16_t z0, fixed_float16_t z1)
 **     extq    z0\.b, z0\.b, z1\.b, #14
 **     ret
 */
-fixed_float16_t
+[[gnu::noipa]] fixed_float16_t
 f6 (fixed_float16_t z0, fixed_float16_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -87,7 +87,7 @@ f6 (fixed_float16_t z0, fixed_float16_t z1)
 **     extq    z0\.b, z0\.b, z1\.b, #1
 **     ret
 */
-fixed_int8_t
+[[gnu::noipa]] fixed_int8_t
 f7 (fixed_int8_t z0, fixed_int8_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -102,7 +102,7 @@ f7 (fixed_int8_t z0, fixed_int8_t z1)
 **     extq    z0\.b, z0\.b, z1\.b, #11
 **     ret
 */
-fixed_int8_t
+[[gnu::noipa]] fixed_int8_t
 f8 (fixed_int8_t z0, fixed_int8_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -117,7 +117,7 @@ f8 (fixed_int8_t z0, fixed_int8_t z1)
 **     extq    z0\.b, z0\.b, z1\.b, #15
 **     ret
 */
-fixed_int8_t
+[[gnu::noipa]] fixed_int8_t
 f9 (fixed_int8_t z0, fixed_int8_t z1)
 {
   return __builtin_shufflevector (z0, z1,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/extq_1_run.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/extq_1_run.c
new file mode 100644
index 000000000000..6b72c98a22cb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/extq_1_run.c
@@ -0,0 +1,73 @@
+/* { dg-do run { target { aarch64_sve256_hw && aarch64_sve2p1_hw } } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "extq_1.c"
+
+#define TEST(A, B)                                                     \
+  do {                                                                 \
+    typeof(B) actual_ = (A);                                           \
+    if (__builtin_memcmp (&actual_, &(B), sizeof (actual_)) != 0)      \
+      __builtin_abort ();                                              \
+  } while (0)
+
+int
+main ()
+{
+  fixed_float64_t a64 = { 1.5, 3.75, -5.25, 9 };
+  fixed_float64_t b64 = { -2, 4.125, -6.375, 11.5 };
+  fixed_float64_t expected1 = { 3.75, -2, 9, -6.375 };
+  TEST (f1 (a64, b64), expected1);
+
+  fixed_uint32_t a32 = { 0x1122, -0x3344, 0x5566, -0x7788,
+                        0x99aa, -0xbbcc, 0xddee, -0xff00 };
+  fixed_uint32_t b32 = { 1 << 20, 1 << 21, 1 << 22, 1 << 23,
+                        5 << 6, 5 << 7, 5 << 8, 5 << 9 };
+  fixed_uint32_t expected2 = { -0x3344, 0x5566, -0x7788, 1 << 20,
+                              -0xbbcc, 0xddee, -0xff00, 5 << 6 };
+  fixed_uint32_t expected3 = { -0x7788, 1 << 20, 1 << 21, 1 << 22,
+                              -0xff00, 5 << 6, 5 << 7, 5 << 8 };
+  TEST (f2 (a32, b32), expected2);
+  TEST (f3 (a32, b32), expected3);
+
+  fixed_float16_t a16 = { 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25,
+                         2.5, 2.75, 3, 3.25, 3.5, 3.75, 4, 4.25 };
+  fixed_float16_t b16 = { -0.5, -0.75, -1, -1.25, -1.5, -1.75, -2, -2.25,
+                         -2.5, -2.75, -3, -3.25, -3.5, -3.75, -4, -4.25 };
+  fixed_float16_t expected4 = { 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, -0.5,
+                               2.75, 3, 3.25, 3.5, 3.75, 4, 4.25, -2.5 };
+  fixed_float16_t expected5 = { 1.75, 2, 2.25, -0.5, -0.75, -1, -1.25, -1.5,
+                               3.75, 4, 4.25, -2.5, -2.75, -3, -3.25, -3.5 };
+  fixed_float16_t expected6 = { 2.25, -0.5, -0.75, -1,
+                               -1.25, -1.5, -1.75, -2,
+                               4.25, -2.5, -2.75, -3,
+                               -3.25, -3.5, -3.75, -4 };
+  TEST (f4 (a16, b16), expected4);
+  TEST (f5 (a16, b16), expected5);
+  TEST (f6 (a16, b16), expected6);
+
+  fixed_int8_t a8 = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70,
+                     0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8,
+                     0xfe, 0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f,
+                     0x76, 0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07 };
+  fixed_int8_t b8 = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
+                     0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00,
+                     0x13, 0x24, 0x35, 0x46, 0x57, 0x68, 0x79, 0x8a,
+                     0x9b, 0xac, 0xbd, 0xce, 0xdf, 0xe0, 0xf1, 0x02 };
+  fixed_int8_t expected7 = { 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70, 0x89,
+                            0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8, 0x11,
+                            0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f, 0x76,
+                            0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07, 0x13 };
+  fixed_int8_t expected8 = { 0xbc, 0xcd, 0xde, 0xef, 0xf8, 0x11, 0x22, 0x33,
+                            0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb,
+                            0x43, 0x32, 0x21, 0x10, 0x07, 0x13, 0x24, 0x35,
+                            0x46, 0x57, 0x68, 0x79, 0x8a, 0x9b, 0xac, 0xbd };
+  fixed_int8_t expected9 = { 0xf8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+                            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
+                            0x07, 0x13, 0x24, 0x35, 0x46, 0x57, 0x68, 0x79,
+                            0x8a, 0x9b, 0xac, 0xbd, 0xce, 0xdf, 0xe0, 0xf1 };
+  TEST (f7 (a8, b8), expected7);
+  TEST (f8 (a8, b8), expected8);
+  TEST (f9 (a8, b8), expected9);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1.c
index f923e9447ec3..587f67076b64 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1.c
@@ -1,5 +1,5 @@
 /* { dg-options "-O2 -msve-vector-bits=256" } */
-/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
 
 #include <arm_sve.h>
 
@@ -15,7 +15,7 @@ typedef svint64_t fixed_int64_t 
__attribute__((arm_sve_vector_bits(256)));
 **     trn1    z0\.d, z0\.d, z1\.d
 **     ret
 */
-fixed_int64_t
+[[gnu::noipa]] fixed_int64_t
 f1 (fixed_int64_t z0, fixed_int64_t z1)
 {
   return __builtin_shufflevector (z0, z1, 0, 4, 2, 6);
@@ -26,7 +26,7 @@ f1 (fixed_int64_t z0, fixed_int64_t z1)
 **     trn2    z0\.d, z0\.d, z1\.d
 **     ret
 */
-fixed_int64_t
+[[gnu::noipa]] fixed_int64_t
 f2 (fixed_int64_t z0, fixed_int64_t z1)
 {
   return __builtin_shufflevector (z0, z1, 1, 5, 3, 7);
@@ -37,7 +37,7 @@ f2 (fixed_int64_t z0, fixed_int64_t z1)
 **     uzpq1   z0\.s, z0\.s, z1\.s
 **     ret
 */
-fixed_float32_t
+[[gnu::noipa]] fixed_float32_t
 f3 (fixed_float32_t z0, fixed_float32_t z1)
 {
   return __builtin_shufflevector (z0, z1, 0, 2, 8, 10, 4, 6, 12, 14);
@@ -48,7 +48,7 @@ f3 (fixed_float32_t z0, fixed_float32_t z1)
 **     uzpq2   z0\.s, z0\.s, z1\.s
 **     ret
 */
-fixed_float32_t
+[[gnu::noipa]] fixed_float32_t
 f4 (fixed_float32_t z0, fixed_float32_t z1)
 {
   return __builtin_shufflevector (z0, z1, 1, 3, 9, 11, 5, 7, 13, 15);
@@ -59,7 +59,7 @@ f4 (fixed_float32_t z0, fixed_float32_t z1)
 **     uzpq1   z0\.h, z0\.h, z1\.h
 **     ret
 */
-fixed_bfloat16_t
+[[gnu::noipa]] fixed_bfloat16_t
 f5 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -72,7 +72,7 @@ f5 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
 **     uzpq2   z0\.h, z0\.h, z1\.h
 **     ret
 */
-fixed_bfloat16_t
+[[gnu::noipa]] fixed_bfloat16_t
 f6 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -85,7 +85,7 @@ f6 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
 **     uzpq1   z0\.b, z0\.b, z1\.b
 **     ret
 */
-fixed_uint8_t
+[[gnu::noipa]] fixed_uint8_t
 f7 (fixed_uint8_t z0, fixed_uint8_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -100,7 +100,7 @@ f7 (fixed_uint8_t z0, fixed_uint8_t z1)
 **     uzpq2   z0\.b, z0\.b, z1\.b
 **     ret
 */
-fixed_uint8_t
+[[gnu::noipa]] fixed_uint8_t
 f8 (fixed_uint8_t z0, fixed_uint8_t z1)
 {
   return __builtin_shufflevector (z0, z1,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1_run.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1_run.c
new file mode 100644
index 000000000000..9044cae659b3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/uzpq_1_run.c
@@ -0,0 +1,78 @@
+/* { dg-do run { target { aarch64_sve256_hw && aarch64_sve2p1_hw } } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "uzpq_1.c"
+
+typedef svuint16_t fixed_uint16_t __attribute__((arm_sve_vector_bits(256)));
+
+#define TEST(A, B)                                                     \
+  do {                                                                 \
+    typeof(A) actual_ = (A);                                           \
+    if (__builtin_memcmp (&actual_, &(B), sizeof (actual_)) != 0)      \
+      __builtin_abort ();                                              \
+  } while (0)
+
+int
+main ()
+{
+  fixed_int64_t a64 = { 0x1122LL << 31, -1LL << 47, 0x5566 << 15, -2 };
+  fixed_int64_t b64 = { 42, -0x3344LL << 19, 303, -0x7788LL << 27 };
+  fixed_int64_t expected1 = { 0x1122LL << 31, 42,
+                             0x5566 << 15, 303 };
+  fixed_int64_t expected2 = { -1LL << 47, -0x3344LL << 19,
+                             -2, -0x7788LL << 27 };
+  TEST (f1 (a64, b64), expected1);
+  TEST (f2 (a64, b64), expected2);
+
+  fixed_float32_t a32 = { 0.5, 0.75, 1, 1.25, 2.5, 2.75, 3, 3.25 };
+  fixed_float32_t b32 = { -0.5, -0.75, -1, -1.25, -2.5, -2.75, -3, -3.25 };
+  fixed_float32_t expected3 = { 0.5, 1, -0.5, -1,
+                               2.5, 3, -2.5, -3 };
+  fixed_float32_t expected4 = { 0.75, 1.25, -0.75, -1.25,
+                               2.75, 3.25, -2.75, -3.25 };
+  TEST (f3 (a32, b32), expected3);
+  TEST (f4 (a32, b32), expected4);
+
+  fixed_uint16_t a16_i = { 0x9a12, 0xbc34, 0xde56, 0xf078,
+                          0x00ff, 0x11ee, 0x22dd, 0x33cc,
+                          0x44bb, 0x55aa, 0x6699, 0x7788,
+                          0xfe01, 0xdc23, 0xba45, 0x9867 };
+  fixed_uint16_t b16_i = { 0x1010, 0x2020, 0x3030, 0x4040,
+                          0x5050, 0x6060, 0x7070, 0x8080,
+                          0x9090, 0xa0a0, 0xb0b0, 0xc0c0,
+                          0xd0d0, 0xe0e0, 0xf0f0, 0x0f0f };
+  fixed_uint16_t expected5 = { 0x9a12, 0xde56, 0x00ff, 0x22dd,
+                              0x1010, 0x3030, 0x5050, 0x7070,
+                              0x44bb, 0x6699, 0xfe01, 0xba45,
+                              0x9090, 0xb0b0, 0xd0d0, 0xf0f0 };
+  fixed_uint16_t expected6 = { 0xbc34, 0xf078, 0x11ee, 0x33cc,
+                              0x2020, 0x4040, 0x6060, 0x8080,
+                              0x55aa, 0x7788, 0xdc23, 0x9867,
+                              0xa0a0, 0xc0c0, 0xe0e0, 0x0f0f };
+  fixed_bfloat16_t a16, b16;
+  __builtin_memcpy (&a16, &a16_i, sizeof (a16));
+  __builtin_memcpy (&b16, &b16_i, sizeof (b16));
+  TEST (f5 (a16, b16), expected5);
+  TEST (f6 (a16, b16), expected6);
+
+  fixed_uint8_t a8 = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70,
+                      0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8,
+                      0xfe, 0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f,
+                      0x76, 0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07 };
+  fixed_uint8_t b8 = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
+                      0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00,
+                      0x13, 0x24, 0x35, 0x46, 0x57, 0x68, 0x79, 0x8a,
+                      0x9b, 0xac, 0xbd, 0xce, 0xdf, 0xe0, 0xf1, 0x02 };
+  fixed_uint8_t expected7 = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
+                             0x11, 0x33, 0x55, 0x77, 0x99, 0xbb, 0xdd, 0xff,
+                             0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
+                             0x13, 0x35, 0x57, 0x79, 0x9b, 0xbd, 0xdf, 0xf1 };
+  fixed_uint8_t expected8 = { 0x12, 0x34, 0x56, 0x70, 0x9a, 0xbc, 0xde, 0xf8,
+                             0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x00,
+                             0xed, 0xcb, 0xa9, 0x8f, 0x65, 0x43, 0x21, 0x07,
+                             0x24, 0x46, 0x68, 0x8a, 0xac, 0xce, 0xe0, 0x02 };
+  TEST (f7 (a8, b8), expected7);
+  TEST (f8 (a8, b8), expected8);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1.c
index fa420a959c72..76fb4b4440b7 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1.c
@@ -1,5 +1,5 @@
 /* { dg-options "-O2 -msve-vector-bits=256" } */
-/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
 
 #include <arm_sve.h>
 
@@ -15,7 +15,7 @@ typedef svint64_t fixed_int64_t 
__attribute__((arm_sve_vector_bits(256)));
 **     trn1    z0\.d, z0\.d, z1\.d
 **     ret
 */
-fixed_int64_t
+[[gnu::noipa]] fixed_int64_t
 f1 (fixed_int64_t z0, fixed_int64_t z1)
 {
   return __builtin_shufflevector (z0, z1, 0, 4, 2, 6);
@@ -26,7 +26,7 @@ f1 (fixed_int64_t z0, fixed_int64_t z1)
 **     trn2    z0\.d, z0\.d, z1\.d
 **     ret
 */
-fixed_int64_t
+[[gnu::noipa]] fixed_int64_t
 f2 (fixed_int64_t z0, fixed_int64_t z1)
 {
   return __builtin_shufflevector (z0, z1, 1, 5, 3, 7);
@@ -37,7 +37,7 @@ f2 (fixed_int64_t z0, fixed_int64_t z1)
 **     zipq1   z0\.s, z0\.s, z1\.s
 **     ret
 */
-fixed_float32_t
+[[gnu::noipa]] fixed_float32_t
 f3 (fixed_float32_t z0, fixed_float32_t z1)
 {
   return __builtin_shufflevector (z0, z1, 0, 8, 1, 9, 4, 12, 5, 13);
@@ -48,7 +48,7 @@ f3 (fixed_float32_t z0, fixed_float32_t z1)
 **     zipq2   z0\.s, z0\.s, z1\.s
 **     ret
 */
-fixed_float32_t
+[[gnu::noipa]] fixed_float32_t
 f4 (fixed_float32_t z0, fixed_float32_t z1)
 {
   return __builtin_shufflevector (z0, z1, 2, 10, 3, 11, 6, 14, 7, 15);
@@ -59,7 +59,7 @@ f4 (fixed_float32_t z0, fixed_float32_t z1)
 **     zipq1   z0\.h, z0\.h, z1\.h
 **     ret
 */
-fixed_bfloat16_t
+[[gnu::noipa]] fixed_bfloat16_t
 f5 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -72,7 +72,7 @@ f5 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
 **     zipq2   z0\.h, z0\.h, z1\.h
 **     ret
 */
-fixed_bfloat16_t
+[[gnu::noipa]] fixed_bfloat16_t
 f6 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -85,7 +85,7 @@ f6 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
 **     zipq1   z0\.b, z0\.b, z1\.b
 **     ret
 */
-fixed_uint8_t
+[[gnu::noipa]] fixed_uint8_t
 f7 (fixed_uint8_t z0, fixed_uint8_t z1)
 {
   return __builtin_shufflevector (z0, z1,
@@ -100,7 +100,7 @@ f7 (fixed_uint8_t z0, fixed_uint8_t z1)
 **     zipq2   z0\.b, z0\.b, z1\.b
 **     ret
 */
-fixed_uint8_t
+[[gnu::noipa]] fixed_uint8_t
 f8 (fixed_uint8_t z0, fixed_uint8_t z1)
 {
   return __builtin_shufflevector (z0, z1,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1_run.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1_run.c
new file mode 100644
index 000000000000..211f9d945edf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/zipq_1_run.c
@@ -0,0 +1,78 @@
+/* { dg-do run { target { aarch64_sve256_hw && aarch64_sve2p1_hw } } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "zipq_1.c"
+
+typedef svuint16_t fixed_uint16_t __attribute__((arm_sve_vector_bits(256)));
+
+#define TEST(A, B)                                                     \
+  do {                                                                 \
+    typeof(A) actual_ = (A);                                           \
+    if (__builtin_memcmp (&actual_, &(B), sizeof (actual_)) != 0)      \
+      __builtin_abort ();                                              \
+  } while (0)
+
+int
+main ()
+{
+  fixed_int64_t a64 = { 0x1122LL << 31, -1LL << 47, 0x5566 << 15, -2 };
+  fixed_int64_t b64 = { 42, -0x3344LL << 19, 303, -0x7788LL << 27 };
+  fixed_int64_t expected1 = { 0x1122LL << 31, 42,
+                             0x5566 << 15, 303 };
+  fixed_int64_t expected2 = { -1LL << 47, -0x3344LL << 19,
+                             -2, -0x7788LL << 27 };
+  TEST (f1 (a64, b64), expected1);
+  TEST (f2 (a64, b64), expected2);
+
+  fixed_float32_t a32 = { 0.5, 0.75, 1, 1.25, 2.5, 2.75, 3, 3.25 };
+  fixed_float32_t b32 = { -0.5, -0.75, -1, -1.25, -2.5, -2.75, -3, -3.25 };
+  fixed_float32_t expected3 = { 0.5, -0.5, 0.75, -0.75,
+                               2.5, -2.5, 2.75, -2.75 };
+  fixed_float32_t expected4 = { 1, -1, 1.25, -1.25,
+                               3, -3, 3.25, -3.25 };
+  TEST (f3 (a32, b32), expected3);
+  TEST (f4 (a32, b32), expected4);
+
+  fixed_uint16_t a16_i = { 0x9a12, 0xbc34, 0xde56, 0xf078,
+                          0x00ff, 0x11ee, 0x22dd, 0x33cc,
+                          0x44bb, 0x55aa, 0x6699, 0x7788,
+                          0xfe01, 0xdc23, 0xba45, 0x9867 };
+  fixed_uint16_t b16_i = { 0x1010, 0x2020, 0x3030, 0x4040,
+                          0x5050, 0x6060, 0x7070, 0x8080,
+                          0x9090, 0xa0a0, 0xb0b0, 0xc0c0,
+                          0xd0d0, 0xe0e0, 0xf0f0, 0x0f0f };
+  fixed_uint16_t expected5 = { 0x9a12, 0x1010, 0xbc34, 0x2020,
+                              0xde56, 0x3030, 0xf078, 0x4040,
+                              0x44bb, 0x9090, 0x55aa, 0xa0a0,
+                              0x6699, 0xb0b0, 0x7788, 0xc0c0 };
+  fixed_uint16_t expected6 = { 0x00ff, 0x5050, 0x11ee, 0x6060,
+                              0x22dd, 0x7070, 0x33cc, 0x8080,
+                              0xfe01, 0xd0d0, 0xdc23, 0xe0e0,
+                              0xba45, 0xf0f0, 0x9867, 0x0f0f };
+  fixed_bfloat16_t a16, b16;
+  __builtin_memcpy (&a16, &a16_i, sizeof (a16));
+  __builtin_memcpy (&b16, &b16_i, sizeof (b16));
+  TEST (f5 (a16, b16), expected5);
+  TEST (f6 (a16, b16), expected6);
+
+  fixed_uint8_t a8 = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x70,
+                      0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf8,
+                      0xfe, 0xed, 0xdc, 0xcb, 0xba, 0xa9, 0x98, 0x8f,
+                      0x76, 0x65, 0x54, 0x43, 0x32, 0x21, 0x10, 0x07 };
+  fixed_uint8_t b8 = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
+                      0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00,
+                      0x13, 0x24, 0x35, 0x46, 0x57, 0x68, 0x79, 0x8a,
+                      0x9b, 0xac, 0xbd, 0xce, 0xdf, 0xe0, 0xf1, 0x02 };
+  fixed_uint8_t expected7 = { 0x01, 0x11, 0x12, 0x22, 0x23, 0x33, 0x34, 0x44,
+                             0x45, 0x55, 0x56, 0x66, 0x67, 0x77, 0x70, 0x88,
+                             0xfe, 0x13, 0xed, 0x24, 0xdc, 0x35, 0xcb, 0x46,
+                             0xba, 0x57, 0xa9, 0x68, 0x98, 0x79, 0x8f, 0x8a };
+  fixed_uint8_t expected8 = { 0x89, 0x99, 0x9a, 0xaa, 0xab, 0xbb, 0xbc, 0xcc,
+                             0xcd, 0xdd, 0xde, 0xee, 0xef, 0xff, 0xf8, 0x00,
+                             0x76, 0x9b, 0x65, 0xac, 0x54, 0xbd, 0x43, 0xce,
+                             0x32, 0xdf, 0x21, 0xe0, 0x10, 0xf1, 0x07, 0x02 };
+  TEST (f7 (a8, b8), expected7);
+  TEST (f8 (a8, b8), expected8);
+
+  return 0;
+}
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 956bc0bc7ca4..9ab46a0eab43 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -6491,6 +6491,23 @@ proc check_effective_target_aarch64_sve2_hw { } {
     }]
 }
 
+# Return true if this is an AArch64 target that can run SVE2.1 code.
+
+proc check_effective_target_aarch64_sve2p1_hw { } {
+    if { ![istarget aarch64*-*-*] } {
+       return 0
+    }
+    return [check_runtime aarch64_sve2p1_hw_available {
+       #pragma GCC target "+sve2p1"
+       int
+       main (void)
+       {
+         asm volatile ("dupq z0.b, z0.b[0]");
+         return 0;
+       }
+    }]
+}
+
 # Return true if this is an AArch64 target that can run SVE code and
 # if its SVE vectors have exactly BITS bits.

Reply via email to