https://gcc.gnu.org/g:7bb1933c1fcca8cb99132ff5d08b3f3efca4ff07
commit r16-1112-g7bb1933c1fcca8cb99132ff5d08b3f3efca4ff07
Author: Pengfei Li <pengfei....@arm.com>
Date:   Wed Jun 4 16:59:44 2025 +0100

    match.pd: Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y) for vectors

    This patch folds vector expressions of the form (x + y) >> 1 into
    IFN_AVG_FLOOR (x, y), reducing instruction count on platforms that
    support averaging operations. For example, it can help improve the
    codegen on AArch64 from:

        add     v0.4s, v0.4s, v31.4s
        ushr    v0.4s, v0.4s, 1

    to:

        uhadd   v0.4s, v0.4s, v31.4s

    As this folding is only valid when the most significant bit of each
    element in both x and y is known to be zero, this patch checks the
    leading zero bits of the elements in x and y, and extends
    get_nonzero_bits_1() to handle uniform vectors. When the input is a
    uniform vector, the function now returns the nonzero bits of its
    element.

    Additionally, this patch adds more checks to reject vector types in
    bit constant propagation (tree-bit-ccp), since tree-bit-ccp was
    designed for scalar values only, and the new vector logic in
    get_nonzero_bits_1() could lead to incorrect propagation results.

    gcc/ChangeLog:

        * match.pd: Add folding rule for vector average.
        * tree-ssa-ccp.cc (get_default_value): Reject vector types.
        (evaluate_stmt): Reject vector types.
        * tree-ssanames.cc (get_nonzero_bits_1): Extend to handle
        uniform vectors.

    gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/acle/uhadd_1.c: New test.

Diff:
---
 gcc/match.pd                                    | 12 +++++++++
 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c | 34 +++++++++++++++++++++++++
 gcc/tree-ssa-ccp.cc                             |  8 +++---
 gcc/tree-ssanames.cc                            |  8 ++++++
 4 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index bde9bd649a70..656572423aa7 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2177,6 +2177,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
      (view_convert (rshift (view_convert:ntype @0) @1))
      (convert (rshift (convert:ntype @0) @1))))))
 
+#if GIMPLE
+ /* Fold ((x + y) >> 1 into IFN_AVG_FLOOR (x, y) if x and y are vectors in
+    which each element is known to have at least one leading zero bit.  */
+(simplify
+ (rshift (plus:cs @0 @1) integer_onep)
+ (if (VECTOR_TYPE_P (type)
+      && direct_internal_fn_supported_p (IFN_AVG_FLOOR, type, OPTIMIZE_FOR_BOTH)
+      && wi::clz (get_nonzero_bits (@0)) > 0
+      && wi::clz (get_nonzero_bits (@1)) > 0)
+  (IFN_AVG_FLOOR @0 @1)))
+#endif
+
 /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST))
    when profitable.
    For bitwise binary operations apply operand conversions to the
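The leading-zero condition in the new match.pd rule above is what keeps the fold
safe: when the addition is performed in the element's own width, the sum can
wrap, and (x + y) >> 1 then differs from the true floor average. A minimal
scalar sketch in C (illustrative only, not part of the patch; the helper names
are made up):

    /* Sketch of why the fold needs the leading-zero guarantee.  */
    #include <stdint.h>
    #include <stdio.h>

    /* (x + y) >> 1 with the addition done in the element's own width, as it
       happens per vector lane: the sum can wrap, so this is not a correct
       floor average for arbitrary inputs.  */
    static uint8_t shift_avg (uint8_t x, uint8_t y)
    {
      return (uint8_t) ((uint8_t) (x + y) >> 1);
    }

    /* What IFN_AVG_FLOOR (AArch64 UHADD) computes: the floor average as if
       the addition were done in wider precision.  */
    static uint8_t floor_avg (uint8_t x, uint8_t y)
    {
      return (uint8_t) (((uint16_t) x + (uint16_t) y) >> 1);
    }

    int main (void)
    {
      /* Top bit clear in both inputs: the two forms agree (104 and 104).  */
      printf ("%d %d\n", shift_avg (0x70, 0x60), floor_avg (0x70, 0x60));
      /* Top bit set: the 8-bit sum wraps, giving 104 vs. the correct 232.  */
      printf ("%d %d\n", shift_avg (0xf0, 0xe0), floor_avg (0xf0, 0xe0));
      return 0;
    }

This is why the pattern requires wi::clz (get_nonzero_bits (...)) > 0 for both
operands instead of firing unconditionally.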
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
new file mode 100644
index 000000000000..f1748a199ade
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
@@ -0,0 +1,34 @@
+/* Test if SIMD fused unsigned halving adds are generated */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \
+  vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vdup ## q ## _n_ ## ts (mask); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  } \
+  \
+  vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  }
+
+FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f)
+FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f)
+FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff)
+FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff)
+FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fffffff)
+FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fffffff)
+
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 8d2cbb384c46..3e0c75cf2be8 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -298,7 +298,7 @@ get_default_value (tree var)
 	{
 	  val.lattice_val = VARYING;
 	  val.mask = -1;
-	  if (flag_tree_bit_ccp)
+	  if (flag_tree_bit_ccp && !VECTOR_TYPE_P (TREE_TYPE (var)))
 	    {
 	      wide_int nonzero_bits = get_nonzero_bits (var);
 	      tree value;
@@ -2491,11 +2491,11 @@ evaluate_stmt (gimple *stmt)
       is_constant = (val.lattice_val == CONSTANT);
     }
 
+  tree lhs = gimple_get_lhs (stmt);
   if (flag_tree_bit_ccp
+      && lhs && TREE_CODE (lhs) == SSA_NAME && !VECTOR_TYPE_P (TREE_TYPE (lhs))
       && ((is_constant && TREE_CODE (val.value) == INTEGER_CST)
-	  || !is_constant)
-      && gimple_get_lhs (stmt)
-      && TREE_CODE (gimple_get_lhs (stmt)) == SSA_NAME)
+	  || !is_constant))
     {
       tree lhs = gimple_get_lhs (stmt);
       wide_int nonzero_bits = get_nonzero_bits (lhs);
diff --git a/gcc/tree-ssanames.cc b/gcc/tree-ssanames.cc
index fd2abfe0745d..b6ca8809dc95 100644
--- a/gcc/tree-ssanames.cc
+++ b/gcc/tree-ssanames.cc
@@ -508,6 +508,14 @@ get_nonzero_bits_1 (const_tree name)
   /* Use element_precision instead of TYPE_PRECISION so complex and
      vector types get a non-zero precision.  */
   unsigned int precision = element_precision (TREE_TYPE (name));
+
+  if (VECTOR_TYPE_P (TREE_TYPE (name)))
+    {
+      tree elem = uniform_vector_p (name);
+      if (elem)
+	return get_nonzero_bits_1 (elem);
+    }
+
   if (TREE_CODE (name) != SSA_NAME)
     return wi::shwi (-1, precision);
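For reference, this is what the test's FUSED_SIMD_UHADD macro expands to for
uint16x8_t, written out by hand (the function name below is illustrative, not
taken from the test):

    #include <arm_neon.h>

    uint16x8_t masked_halving_add (uint16x8_t a)
    {
      /* Bit 15 of every lane of both addends is provably zero: x because of
         the mask, and y because the extended get_nonzero_bits_1 now looks at
         the element of a uniform vector such as vdupq_n_u16 (0x7fff).  */
      uint16x8_t x = vandq_u16 (a, vdupq_n_u16 (0x7fff));
      uint16x8_t y = vdupq_n_u16 (0x7fff);
      /* (x + y) >> 1: expected to fold to IFN_AVG_FLOOR, i.e. a single
         uhadd, instead of add + ushr.  */
      return vshrq_n_u16 (vaddq_u16 (x, y), 1);
    }

With the patch applied, AArch64 GCC at -O2 should emit one uhadd for the return
expression, which is what the scan-assembler-times directives above count.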