https://gcc.gnu.org/g:c83e2d47574fd9a21f257e0f0d7e350c3f1b0618

commit r15-5324-gc83e2d47574fd9a21f257e0f0d7e350c3f1b0618
Author: Jennifer Schmitz <jschm...@nvidia.com>
Date:   Mon Nov 4 07:56:09 2024 -0800

    match.pd: Fold vec_perm with view_convert
    
    This patch improves the codegen for the following test case:
    uint64x2_t foo (uint64x2_t r) {
        uint32x4_t a = vreinterpretq_u32_u64 (r);
        uint32_t t;
        t = a[0]; a[0] = a[1]; a[1] = t;
        t = a[2]; a[2] = a[3]; a[3] = t;
        return vreinterpretq_u64_u32 (a);
    }
    from (-O1):
    foo:
            mov     v31.16b, v0.16b
            ins     v0.s[0], v0.s[1]
            ins     v0.s[1], v31.s[0]
            ins     v0.s[2], v31.s[3]
            ins     v0.s[3], v31.s[2]
            ret
    to:
    foo:
            rev64   v0.4s, v0.4s
            ret
    
    This is achieved by extending the following match.pd pattern to account
    for type differences between @0 and @1 due to view converts.
    /* Simplify vector inserts of other vector extracts to a permute.  */
    (simplify
     (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
    
    The patch was bootstrapped and regtested on aarch64-linux-gnu and
    x86_64-linux-gnu with no regressions.
    OK for mainline?
    
    Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
    Co-authored-by: Richard Biener <rguent...@suse.de>
    
    gcc/
            PR tree-optimization/117093
            * match.pd: Extend
            (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) to allow
            type differences between @0 and @1 due to view converts.
    
    gcc/testsuite/
            PR tree-optimization/117093
            * gcc.dg/tree-ssa/pr117093.c: New test.

Diff:
---
 gcc/match.pd                             | 13 ++++++++-----
 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +++++++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 0ac5674f24be..753bf811f67a 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9583,7 +9583,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (VECTOR_TYPE_P (type)
       && (VECTOR_MODE_P (TYPE_MODE (type))
          || optimize_vectors_before_lowering_p ())
-      && types_match (@0, @1)
+      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
+                         TYPE_SIZE (TREE_TYPE (@1)), 0)
       && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
       && TYPE_VECTOR_SUBPARTS (type).is_constant ()
       && multiple_p (wi::to_poly_offset (@rpos),
@@ -9591,7 +9592,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (with
    {
      unsigned HOST_WIDE_INT elsz
-       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
+       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
      poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
      poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
      unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
@@ -9602,9 +9603,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
      vec_perm_indices sel (builder, 2, nunits);
    }
    (if (!VECTOR_MODE_P (TYPE_MODE (type))
-       || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
-    (vec_perm @0 @1 { vec_perm_indices_to_tree
-                        (build_vector_type (ssizetype, nunits), sel); })))))
+       || can_vec_perm_const_p (TYPE_MODE (type),
+                                TYPE_MODE (type), sel, false))
+    (vec_perm @0 (view_convert @1)
+     { vec_perm_indices_to_tree (build_vector_type (ssizetype, nunits),
+                                sel); })))))
 
 (if (canonicalize_math_after_vectorization_p ())
  (for fmas (FMA)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
new file mode 100644
index 000000000000..0fea32919dd0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
@@ -0,0 +1,17 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+**     rev64   v0\.4s, v0\.4s
+**     ret
+*/
+uint64x2_t foo (uint64x2_t r) {
+    uint32x4_t a = vreinterpretq_u32_u64 (r);
+    uint32_t t;
+    t = a[0]; a[0] = a[1]; a[1] = t;
+    t = a[2]; a[2] = a[3]; a[3] = t;
+    return vreinterpretq_u64_u32 (a);
+}

Reply via email to