https://gcc.gnu.org/g:7bb1933c1fcca8cb99132ff5d08b3f3efca4ff07
commit r16-1112-g7bb1933c1fcca8cb99132ff5d08b3f3efca4ff07
Author: Pengfei Li <pengfei....@arm.com>
Date:   Wed Jun 4 16:59:44 2025 +0100

    match.pd: Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y) for vectors

    This patch folds vector expressions of the form (x + y) >> 1 into
    IFN_AVG_FLOOR (x, y), reducing instruction count on platforms that
    support averaging operations. For example, it can help improve the
    codegen on AArch64 from:

        add     v0.4s, v0.4s, v31.4s
        ushr    v0.4s, v0.4s, 1

    to:

        uhadd   v0.4s, v0.4s, v31.4s

    As this folding is only valid when the most significant bit of each
    element in both x and y is known to be zero, this patch checks the
    leading zero bits of the elements in x and y, and extends
    get_nonzero_bits_1() to handle uniform vectors. When the input is a
    uniform vector, the function now returns the nonzero bits of its
    element.

    Additionally, this patch adds more checks to reject vector types in
    bit constant propagation (tree-bit-ccp), since tree-bit-ccp was
    designed for scalar values only, and the new vector logic in
    get_nonzero_bits_1() could lead to incorrect propagation results.

    gcc/ChangeLog:

        * match.pd: Add folding rule for vector average.
        * tree-ssa-ccp.cc (get_default_value): Reject vector types.
        (evaluate_stmt): Reject vector types.
        * tree-ssanames.cc (get_nonzero_bits_1): Extend to handle
        uniform vectors.

    gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/acle/uhadd_1.c: New test.

Diff:
---
 gcc/match.pd                                    | 12 +++++++++
 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c | 34 +++++++++++++++++++++++++
 gcc/tree-ssa-ccp.cc                             |  8 +++---
 gcc/tree-ssanames.cc                            |  8 ++++++
 4 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index bde9bd649a70..656572423aa7 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2177,6 +2177,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
      (view_convert (rshift (view_convert:ntype @0) @1))
      (convert (rshift (convert:ntype @0) @1))))))
 
+#if GIMPLE
+ /* Fold ((x + y) >> 1 into IFN_AVG_FLOOR (x, y) if x and y are vectors in
+    which each element is known to have at least one leading zero bit.  */
+(simplify
+ (rshift (plus:cs @0 @1) integer_onep)
+ (if (VECTOR_TYPE_P (type)
+      && direct_internal_fn_supported_p (IFN_AVG_FLOOR, type, OPTIMIZE_FOR_BOTH)
+      && wi::clz (get_nonzero_bits (@0)) > 0
+      && wi::clz (get_nonzero_bits (@1)) > 0)
+  (IFN_AVG_FLOOR @0 @1)))
+#endif
+
 /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST))
    when profitable.
    For bitwise binary operations apply operand conversions to the
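The leading-zero condition in the new match.pd rule above is what keeps the fold
safe: when the addition is performed in the element's own width, the sum can
wrap, and (x + y) >> 1 then differs from the true floor average. A minimal
scalar sketch in C (illustrative only, not part of the patch; the helper names
are made up):

    /* Sketch of why the fold needs the leading-zero guarantee.  */
    #include <stdint.h>
    #include <stdio.h>

    /* (x + y) >> 1 with the addition done in the element's own width, as it
       happens per vector lane: the sum can wrap, so this is not a correct
       floor average for arbitrary inputs.  */
    static uint8_t shift_avg (uint8_t x, uint8_t y)
    {
      return (uint8_t) ((uint8_t) (x + y) >> 1);
    }

    /* What IFN_AVG_FLOOR (AArch64 UHADD) computes: the floor average as if
       the addition were done in wider precision.  */
    static uint8_t floor_avg (uint8_t x, uint8_t y)
    {
      return (uint8_t) (((uint16_t) x + (uint16_t) y) >> 1);
    }

    int main (void)
    {
      /* Top bit clear in both inputs: the two forms agree (104 and 104).  */
      printf ("%d %d\n", shift_avg (0x70, 0x60), floor_avg (0x70, 0x60));
      /* Top bit set: the 8-bit sum wraps, giving 104 vs. the correct 232.  */
      printf ("%d %d\n", shift_avg (0xf0, 0xe0), floor_avg (0xf0, 0xe0));
      return 0;
    }

This is why the pattern requires wi::clz (get_nonzero_bits (...)) > 0 for both
operands instead of firing unconditionally.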
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
new file mode 100644
index 000000000000..f1748a199ade
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
@@ -0,0 +1,34 @@
+/* Test if SIMD fused unsigned halving adds are generated */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \
+  vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vdup ## q ## _n_ ## ts (mask); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  } \
+  \
+  vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  }
+
+FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f)
+FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f)
+FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff)
+FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff)
+FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fffffff)
+FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fffffff)
+
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 8d2cbb384c46..3e0c75cf2be8 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -298,7 +298,7 @@ get_default_value (tree var)
 	{
 	  val.lattice_val = VARYING;
 	  val.mask = -1;
-	  if (flag_tree_bit_ccp)
+	  if (flag_tree_bit_ccp && !VECTOR_TYPE_P (TREE_TYPE (var)))
 	    {
 	      wide_int nonzero_bits = get_nonzero_bits (var);
 	      tree value;
@@ -2491,11 +2491,11 @@ evaluate_stmt (gimple *stmt)
       is_constant = (val.lattice_val == CONSTANT);
     }
 
+  tree lhs = gimple_get_lhs (stmt);
   if (flag_tree_bit_ccp
+      && lhs && TREE_CODE (lhs) == SSA_NAME && !VECTOR_TYPE_P (TREE_TYPE (lhs))
       && ((is_constant && TREE_CODE (val.value) == INTEGER_CST)
-	  || !is_constant)
-      && gimple_get_lhs (stmt)
-      && TREE_CODE (gimple_get_lhs (stmt)) == SSA_NAME)
+	  || !is_constant))
     {
       tree lhs = gimple_get_lhs (stmt);
       wide_int nonzero_bits = get_nonzero_bits (lhs);
diff --git a/gcc/tree-ssanames.cc b/gcc/tree-ssanames.cc
index fd2abfe0745d..b6ca8809dc95 100644
--- a/gcc/tree-ssanames.cc
+++ b/gcc/tree-ssanames.cc
@@ -508,6 +508,14 @@ get_nonzero_bits_1 (const_tree name)
   /* Use element_precision instead of TYPE_PRECISION so complex and
      vector types get a non-zero precision.  */
   unsigned int precision = element_precision (TREE_TYPE (name));
+
+  if (VECTOR_TYPE_P (TREE_TYPE (name)))
+    {
+      tree elem = uniform_vector_p (name);
+      if (elem)
+	return get_nonzero_bits_1 (elem);
+    }
+
   if (TREE_CODE (name) != SSA_NAME)
     return wi::shwi (-1, precision);
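For reference, this is what the test's FUSED_SIMD_UHADD macro expands to for
uint16x8_t, written out by hand (the function name below is illustrative, not
taken from the test):

    #include <arm_neon.h>

    uint16x8_t masked_halving_add (uint16x8_t a)
    {
      /* Bit 15 of every lane of both addends is provably zero: x because of
         the mask, and y because the extended get_nonzero_bits_1 now looks at
         the element of a uniform vector such as vdupq_n_u16 (0x7fff).  */
      uint16x8_t x = vandq_u16 (a, vdupq_n_u16 (0x7fff));
      uint16x8_t y = vdupq_n_u16 (0x7fff);
      /* (x + y) >> 1: expected to fold to IFN_AVG_FLOOR, i.e. a single
         uhadd, instead of add + ushr.  */
      return vshrq_n_u16 (vaddq_u16 (x, y), 1);
    }

With the patch applied, AArch64 GCC at -O2 should emit one uhadd for the return
expression, which is what the scan-assembler-times directives above count.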