https://gcc.gnu.org/g:c83e2d47574fd9a21f257e0f0d7e350c3f1b0618
commit r15-5324-gc83e2d47574fd9a21f257e0f0d7e350c3f1b0618
Author: Jennifer Schmitz <jschm...@nvidia.com>
Date:   Mon Nov 4 07:56:09 2024 -0800

    match.pd: Fold vec_perm with view_convert

    This patch improves the codegen for the following test case:
    uint64x2_t foo (uint64x2_t r) {
        uint32x4_t a = vreinterpretq_u32_u64 (r);
        uint32_t t;
        t = a[0]; a[0] = a[1]; a[1] = t;
        t = a[2]; a[2] = a[3]; a[3] = t;
        return vreinterpretq_u64_u32 (a);
    }
    from (-O1):
    foo:
            mov     v31.16b, v0.16b
            ins     v0.s[0], v0.s[1]
            ins     v0.s[1], v31.s[0]
            ins     v0.s[2], v31.s[3]
            ins     v0.s[3], v31.s[2]
            ret
    to:
    foo:
            rev64   v0.4s, v0.4s
            ret

    This is achieved by extending the following match.pd pattern to account
    for type differences between @0 and @1 due to view converts.
    /* Simplify vector inserts of other vector extracts to a permute. */
    (simplify
     (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)

    The patch was bootstrapped and regtested on aarch64-linux-gnu and
    x86_64-linux-gnu, no regression.
    OK for mainline?

    Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
    Co-authored-by: Richard Biener <rguent...@suse.de>

    gcc/
            PR tree-optimization/117093
            * match.pd: Extend
            (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) to allow
            type differences between @0 and @1 due to view converts.

    gcc/testsuite/
            PR tree-optimization/117093
            * gcc.dg/tree-ssa/pr117093.c: New test.
Diff:
---
 gcc/match.pd                             | 13 ++++++++-----
 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +++++++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 0ac5674f24be..753bf811f67a 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9583,7 +9583,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (VECTOR_TYPE_P (type)
       && (VECTOR_MODE_P (TYPE_MODE (type))
           || optimize_vectors_before_lowering_p ())
-      && types_match (@0, @1)
+      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
+                          TYPE_SIZE (TREE_TYPE (@1)), 0)
       && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
       && TYPE_VECTOR_SUBPARTS (type).is_constant ()
       && multiple_p (wi::to_poly_offset (@rpos),
@@ -9591,7 +9592,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (with
    {
      unsigned HOST_WIDE_INT elsz
-       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
+       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
      poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
      poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
      unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
@@ -9602,9 +9603,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
      vec_perm_indices sel (builder, 2, nunits);
    }
    (if (!VECTOR_MODE_P (TYPE_MODE (type))
-        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
-    (vec_perm @0 @1 { vec_perm_indices_to_tree
-                      (build_vector_type (ssizetype, nunits), sel); })))))
+        || can_vec_perm_const_p (TYPE_MODE (type),
+                                 TYPE_MODE (type), sel, false))
+    (vec_perm @0 (view_convert @1)
+     { vec_perm_indices_to_tree (build_vector_type (ssizetype, nunits),
+                                 sel); })))))
 
 (if (canonicalize_math_after_vectorization_p ())
  (for fmas (FMA)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
new file mode 100644
index 000000000000..0fea32919dd0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
@@ -0,0 +1,17 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+** rev64 v0\.4s, v0\.4s
+** ret
+*/
+uint64x2_t foo (uint64x2_t r) {
+    uint32x4_t a = vreinterpretq_u32_u64 (r);
+    uint32_t t;
+    t = a[0]; a[0] = a[1]; a[1] = t;
+    t = a[2]; a[2] = a[3]; a[3] = t;
+    return vreinterpretq_u64_u32 (a);
+}