[PATCH] simplify-rtx: Combine bitwise operations in more cases

2025-04-23 Thread Pengfei Li
This patch transforms RTL expressions of the form (subreg (not X) off)
into (not (subreg X off)) when the subreg is an operand of a bitwise AND
or OR. This transformation can expose opportunities to combine a NOT
operation with the bitwise AND/OR.

For example, it improves the codegen of the following AArch64 NEON
intrinsics:
vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(a)),
  vreinterpretq_s64_s32(b));
from:
not v0.16b, v0.16b
and v0.16b, v0.16b, v1.16b
to:
bic v0.16b, v1.16b, v0.16b
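At the RTL level the vreinterpretq becomes a subreg, so before this change
combine sees the NOT buried inside the subreg and cannot match the backend's
existing (and (not x) y) pattern for BIC. Schematically (modes and operands
simplified, for illustration only):

(and:V2DI (subreg:V2DI (not:V4SI x) 0) y)   ; before: NOT hidden by the subreg
(and:V2DI (not:V2DI (subreg:V2DI x 0)) y)   ; after:  matches the BIC pattern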

Regression tested on x86_64-linux-gnu, arm-linux-gnueabihf and
aarch64-linux-gnu.

gcc/ChangeLog:

* simplify-rtx.cc (simplify_context::simplify_binary_operation_1):
  Add RTX simplification for bitwise AND/OR.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/simd/bic_orn_1.c: New test.
---
 gcc/simplify-rtx.cc   | 24 +++
 .../gcc.target/aarch64/simd/bic_orn_1.c   | 17 +
 2 files changed, 41 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 88d31a71c05..ed620ef5d45 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -3738,6 +3738,18 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), op1))
return simplify_gen_binary (IOR, mode, XEXP (op0, 1), op1);
 
+  /* Convert (ior (subreg (not X) off) Y) into (ior (not (subreg X off)) Y)
+to expose opportunities to combine IOR and NOT.  */
+  if (GET_CODE (op0) == SUBREG
+ && GET_CODE (SUBREG_REG (op0)) == NOT)
+   {
+ rtx new_subreg = gen_rtx_SUBREG (mode,
+  XEXP (SUBREG_REG (op0), 0),
+  SUBREG_BYTE (op0));
+ rtx new_not = simplify_gen_unary (NOT, mode, new_subreg, mode);
+ return simplify_gen_binary (IOR, mode, new_not, op1);
+   }
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
@@ -4274,6 +4286,18 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
	return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP (op0, 1));
}
 
+  /* Convert (and (subreg (not X) off) Y) into (and (not (subreg X off)) Y)
+to expose opportunities to combine AND and NOT.  */
+  if (GET_CODE (op0) == SUBREG
+ && GET_CODE (SUBREG_REG (op0)) == NOT)
+   {
+ rtx new_subreg = gen_rtx_SUBREG (mode,
+  XEXP (SUBREG_REG (op0), 0),
+  SUBREG_BYTE (op0));
+ rtx new_not = simplify_gen_unary (NOT, mode, new_subreg, mode);
+ return simplify_gen_binary (AND, mode, new_not, op1);
+   }
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c b/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c
new file mode 100644
index 000..1c66f21424e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+int64x2_t bic_16b (int32x4_t a, int32x4_t b) {
+  return vandq_s64 (vreinterpretq_s64_s32 (vmvnq_s32 (a)),
+   vreinterpretq_s64_s32 (b));
+}
+
+int16x4_t orn_8b (int32x2_t a, int32x2_t b) {
+  return vorr_s16 (vreinterpret_s16_s32 (a),
+  vreinterpret_s16_s32 (vmvn_s32 (b)));
+}
+
+/* { dg-final { scan-assembler {\tbic\tv[0-9]+\.16b} } } */
+/* { dg-final { scan-assembler {\torn\tv[0-9]+\.8b} } } */
-- 
2.43.0



[PATCH v2] simplify-rtx: Combine bitwise operations in more cases

2025-04-28 Thread Pengfei Li
This patch transforms RTL expressions of the form (subreg (not X)) into
(not (subreg X)) if the subreg is an operand of another binary logical
operation. This transformation can expose opportunities to combine more
logical operations.

For example, it improves the codegen of the following AArch64 NEON
intrinsics:
vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(a)),
  vreinterpretq_s64_s32(b));
from:
not v0.16b, v0.16b
and v0.16b, v0.16b, v1.16b
to:
bic v0.16b, v1.16b, v0.16b

Regression tested on x86_64-linux-gnu, arm-linux-gnueabihf and
aarch64-linux-gnu.

gcc/ChangeLog:

* simplify-rtx.cc (non_paradoxical_subreg_not_p): New function
for pattern match of (subreg (not X)).
(simplify_with_subreg_not): New function for simplification.
---
 gcc/simplify-rtx.cc   | 50 +++
 .../gcc.target/aarch64/simd/bic_orn_1.c   | 17 +++
 2 files changed, 67 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 06b52ca8003..5a6c1a9c039 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -3032,6 +3032,44 @@ match_plus_neg_pattern (rtx op0, rtx op1, machine_mode mode)
   return false;
 }
 
+/* Check if OP matches the pattern of (subreg (not X)) and the subreg is
+   non-paradoxical.  */
+
+static bool
+non_paradoxical_subreg_not_p (rtx op)
+{
+  return GET_CODE (op) == SUBREG
+&& !paradoxical_subreg_p (op)
+&& GET_CODE (SUBREG_REG (op)) == NOT;
+}
+
+/* Convert (binop (subreg (not X)) Y) into (binop (not (subreg X)) Y), or
+   (binop X (subreg (not Y))) into (binop X (not (subreg Y))) to expose
+   opportunities to combine another binary logical operation with NOT.  */
+
+static rtx
+simplify_with_subreg_not (rtx_code binop, machine_mode mode, rtx op0, rtx op1)
+{
+  rtx opn = NULL_RTX;
+  if (non_paradoxical_subreg_not_p (op0))
+opn = op0;
+  else if (non_paradoxical_subreg_not_p (op1))
+opn = op1;
+
+  if (opn == NULL_RTX)
+return NULL_RTX;
+
+  rtx new_subreg = simplify_gen_subreg (mode,
+   XEXP (SUBREG_REG (opn), 0),
+   GET_MODE (SUBREG_REG (opn)),
+   SUBREG_BYTE (opn));
+  rtx new_not = simplify_gen_unary (NOT, mode, new_subreg, mode);
+  if (opn == op0)
+return simplify_gen_binary (binop, mode, new_not, op1);
+  else
+return simplify_gen_binary (binop, mode, op0, new_not);
+}
+
 /* Subroutine of simplify_binary_operation.  Simplify a binary operation
CODE with result mode MODE, operating on OP0 and OP1.  If OP0 and/or
OP1 are constant pool references, TRUEOP0 and TRUEOP1 represent the
@@ -3749,6 +3787,10 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), op1))
return simplify_gen_binary (IOR, mode, XEXP (op0, 1), op1);
 
+  tem = simplify_with_subreg_not (code, mode, op0, op1);
+  if (tem)
+   return tem;
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
@@ -4017,6 +4059,10 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), op1))
return simplify_gen_binary (IOR, mode, XEXP (op0, 1), op1);
 
+  tem = simplify_with_subreg_not (code, mode, op0, op1);
+  if (tem)
+   return tem;
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
@@ -4285,6 +4331,10 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
	return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP (op0, 1));
}
 
+  tem = simplify_with_subreg_not (code, mode, op0, op1);
+  if (tem)
+   return tem;
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c b/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c
new file mode 100644
index 000..1c66f21424e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+int64x2_t bic_16b (int32x4_t a, int32x4_t b) {
+  return vandq_s64 (vreinterpretq_s64_s32 (vmvnq_s32 (a)),
+   vreinterpretq_s64_s32 (b));
+}
+
+int16x4_t orn_8b (int32x2_t a, int32x2_t b) {
+  return vorr_s16 (vreinterpret_s16_s32 (a),
+  vreinterpret_s16_s32 (vmvn_s32 (b)));
+}
+
+/* { dg-final { scan-assembler {\tbic\tv[0-9]+\.16b} } } */
+/* { dg-final { scan-assembler {\torn\tv[0-9]+\.8b} } } */
-- 
2.43.0



Re: [PATCH] simplify-rtx: Combine bitwise operations in more cases

2025-04-28 Thread Pengfei Li
Thanks Richard for all review comments. I have addressed the comments and sent 
a v2 patch in a new email thread.

--
Thanks,
Pengfei

[PATCH] AArch64: Fold unsigned ADD + LSR by 1 to UHADD

2025-04-28 Thread Pengfei Li
This patch implements the folding of a vector addition followed by a
logical shift right by 1 (add + lsr #1) on AArch64 into an unsigned
halving add, allowing GCC to emit NEON or SVE2 UHADD instructions.

For example, this patch helps improve the codegen from:
add v0.4s, v0.4s, v31.4s
ushr    v0.4s, v0.4s, 1
to:
uhadd   v0.4s, v0.4s, v31.4s

For NEON, vector operations are represented using generic mid-end
operations, so new folding rules are added to match.pd. For SVE2, the
operations are represented using built-in GIMPLE calls, so this
optimization is implemented via gimple_folder.

To ensure correctness, additional checks are introduced to guarantee
that the operands to UHADD are vectors in which each element has its top
bit cleared.
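To see why this check is needed, consider a scalar analogue (an illustration
only, not part of the patch): when an operand's top bit is set, the lane-wide
addition wraps and a plain shift loses the carry that a halving add preserves.

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  uint8_t x = 0x80, y = 0x80;                 /* top bits set */
  uint8_t wrapped = (uint8_t) (x + y) >> 1;   /* 8-bit add wraps, result 0x00 */
  uint8_t halving = ((unsigned) x + y) >> 1;  /* carry kept, as UHADD would: 0x80 */
  printf ("%#x vs %#x\n", wrapped, halving);
  return 0;
}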

This patch has been bootstrapped and regression tested on
x86_64-linux-gnu and aarch64-linux-gnu.

gcc/ChangeLog:

* config/aarch64/aarch64-sve-builtins-base.cc (find_sve_builtin_call):
New helper function for finding and checking a GIMPLE call.
(is_undef): Rewrite with find_sve_builtin_call.
(class svlsr_impl): Implement the folding for SVE2.
(FUNCTION): Check and fold the pattern.
* match.pd: Add new rules to implement the folding for NEON.
* tree.cc (top_bit_zero_vector_p): Add a new utility function for
vector top bit zero check.
* tree.h (top_bit_zero_vector_p): Add a function declaration.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/acle/uhadd_1.c: New test.
* gcc.target/aarch64/sve2/acle/general/uhadd_1.c: New test.
---
 .../aarch64/aarch64-sve-builtins-base.cc  | 101 --
 gcc/match.pd  |   7 ++
 .../gcc.target/aarch64/acle/uhadd_1.c |  34 ++
 .../aarch64/sve2/acle/general/uhadd_1.c   |  30 ++
 gcc/tree.cc   |  30 ++
 gcc/tree.h|   4 +
 6 files changed, 199 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index b4396837c24..ce6da82bf81 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -43,6 +43,7 @@
 #include "aarch64-sve-builtins.h"
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
+#include "aarch64-sve-builtins-sve2.h"
 #include "aarch64-sve-builtins-functions.h"
 #include "aarch64-builtins.h"
 #include "ssa.h"
@@ -53,6 +54,23 @@ using namespace aarch64_sve;
 
 namespace {
 
+/* Return gcall* if VAL is an SSA_NAME defined by the given SVE intrinsics call.
+   Otherwise return NULL.  */
+static gcall*
+find_sve_builtin_call (tree val, const function_base *func)
+{
+  if (TREE_CODE (val) == SSA_NAME)
+{
+  gimple *def = SSA_NAME_DEF_STMT (val);
+  if (gcall *call = dyn_cast <gcall *> (def))
+   if (tree fndecl = gimple_call_fndecl (call))
+ if (const function_instance *instance = lookup_fndecl (fndecl))
+   if (instance->base == func)
+ return call;
+}
+  return NULL;
+}
+
 /* Return true if VAL is an undefined value.  */
 static bool
 is_undef (tree val)
@@ -62,12 +80,7 @@ is_undef (tree val)
   if (ssa_undefined_value_p (val, false))
return true;
 
-  gimple *def = SSA_NAME_DEF_STMT (val);
-  if (gcall *call = dyn_cast <gcall *> (def))
-   if (tree fndecl = gimple_call_fndecl (call))
- if (const function_instance *instance = lookup_fndecl (fndecl))
-   if (instance->base == functions::svundef)
- return true;
+  return (find_sve_builtin_call (val, functions::svundef) != NULL);
 }
   return false;
 }
@@ -2088,6 +2101,80 @@ public:
   }
 };
 
+class svlsr_impl : public rtx_code_function
+{
+private:
+  /* Return true if we know active lanes for use in T have top bit zero, where
+ pg_use tells which lanes are active for use.  */
+  bool
+  active_lanes_top_bit_zero_p (tree t, tree pg_use) const
+  {
+/* Return true if T itself is a vector in which each element has top bit
+   zero.  */
+if (top_bit_zero_vector_p (t))
+  return true;
+
+/* Return true if T is an AND op with a vector in which each element has
+   top bit zero.  Note the predicate for AND op should cover active lanes
+   for use.  */
+gcall *and_call = find_sve_builtin_call (t, functions::svand);
+if (and_call != NULL)
+  {
+   tree pg = gimple_call_arg (and_call, 0);
+   if (pg == pg_use || is_ptrue (pg, element_precision (t) / CHAR_BIT))
+ {
+   return top_bit_zero_vector_p (gimple_call_arg (and_call, 1))
+   || top_bit_zero_vector_p (gimple_call_arg (and_call, 2));
+ }
+  }
+
+return false;
+  }
+
+public:
+  CONSTEXPR svlsr_

Re: [PATCH] (not just) AArch64: Fold unsigned ADD + LSR by 1 to UHADD

2025-05-02 Thread Pengfei Li
> Heh.  This is a bit of a hobby-horse of mine.  IMO we should be trying
> to make the generic, target-independent vector operations as useful
> as possible, so that people only need to resort to target-specific
> intrinsics if they're doing something genuinely target-specific.
> At the moment, we have the problem that the intrinsics are being
> used for two very different purposes:
> 
> (1) Let people who know the architecture well write high-level assembly.
> For this use case, the compiler should only interfere with the
>user's instruction selection if the compiler can be sure that
>it's improving things.
>
> (2) Vector intrinsics just express dataflow, with no expectation from the
> user about how the intrinsics will be implemented.  In this use case,
> svand is "&, but for SVE vectors".  The user wants to do an "&",
> looks up the SVE intrinsic for AND, and writes "svand" (or more
> likely, uses a retargetable SIMD framework that does this for them).
> Then the compiler is expected to map svand back to "&" internally.
>
> So yeah, IMO we should encourage users in group (2) to use C/C++
> operators or generic builtins where possible, since it expresses the
> intent better and is far less cumbersome.  And I agree that that's the
> more important case as far as this fold goes.  So personally I'd be
> happy with just that.
> 
> But getting nonzero_bits information out of intrinsics is a legitimate
> use case too.  It's up to you whether you want to go that far.

Thank you for sharing your thought. AFAIK, the initial motivation of this fold 
is to optimize some code pattern in SLEEF and it can be done without the SVE 
built-in part. The SVE built-in part is what I thought could be extended to 
handle more cases in the future. But I'm not sure if there's a real need for it 
now.

So I'm going to split my patch and drop the SVE built-in part at the moment. I 
can re-do it later when either there's a clear need or I've figured out a 
better way to implement it.

--
Thanks,
Pengfei


Re: [PATCH] (not just) AArch64: Fold unsigned ADD + LSR by 1 to UHADD

2025-05-01 Thread Pengfei Li
Thank you for the comments.

> I don't think we can use an unbounded recursive walk, since that
> would become quadratic if we ever used it when optimising one
> AND in a chain of ANDs.  (And using this function for ANDs
> seems plausible.)  Maybe we should be handling the information
> in a similar way to Ranger.

I'm trying to get rid of the recursion by reusing the code in 
get_nonzero_bits().

> Rather than handle the built-in case entirely in target code, how about
> having a target hook into nonzero_element_bits (or whatever replaces it)
> for machine-dependent builtins?

From the perspective of necessity, do you think it's worth checking the
"svand" call inside, or worth handling the whole built-in case? Operations
with ACLE SVE types can also be folded as long as we use C/C++ general
operators, which has been supported in GCC 15.

Thanks,
Pengfei


[PATCH v2] match.pd: Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y) for vectors

2025-05-08 Thread Pengfei Li
This patch folds vector expressions of the form (x + y) >> 1 into
IFN_AVG_FLOOR (x, y), reducing instruction count on platforms that
support averaging operations. For example, it can help improve the
codegen on AArch64 from:
add v0.4s, v0.4s, v31.4s
ushr    v0.4s, v0.4s, 1
to:
uhadd   v0.4s, v0.4s, v31.4s

As this folding is only valid when the most significant bit of each
element in both x and y is known to be zero, this patch checks leading
zero bits of elements in x and y, and extends get_nonzero_bits_1() to
handle uniform vectors. When the input is a uniform vector, the function
now returns the nonzero bits of its element.

Additionally, this patch adds more checks to reject vector types in bit
constant propagation (tree-bit-ccp), since tree-bit-ccp was designed for
scalar values only, and the new vector logic in get_nonzero_bits_1()
could lead to incorrect propagation results.
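For reference, the first instantiation of the FUSED_SIMD_UHADD macro in the
new test below expands to roughly the following (hand-expanded here for
illustration; the masked and constant addends are what make the
leading-zero-bit check succeed):

#include <arm_neon.h>

/* Hand-expanded form of FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f): both
   addends have a clear top bit (one via the mask, one as a constant),
   so (v1 + v2) >> 1 can be folded to an unsigned halving add.  */
uint8x8_t simd_uhadd_u8_1 (uint8x8_t a)
{
  uint8x8_t v1 = vand_u8 (a, vdup_n_u8 (0x7f));
  uint8x8_t v2 = vdup_n_u8 (0x7f);
  return vshr_n_u8 (vadd_u8 (v1, v2), 1);
}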

Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/ChangeLog:

* match.pd: Add folding rule for vector average.
* tree-ssa-ccp.cc (get_default_value): Reject vector types.
(evaluate_stmt): Reject vector types.
* tree-ssanames.cc (get_nonzero_bits_1): Extend to handle
uniform vectors.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/acle/uhadd_1.c: New test.
---
 gcc/match.pd  |  9 +
 .../gcc.target/aarch64/acle/uhadd_1.c | 34 +++
 gcc/tree-ssa-ccp.cc   |  8 ++---
 gcc/tree-ssanames.cc  |  8 +
 4 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c

diff --git a/gcc/match.pd b/gcc/match.pd
index ab496d923cc..ddd16a10944 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2177,6 +2177,15 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (view_convert (rshift (view_convert:ntype @0) @1))
 (convert (rshift (convert:ntype @0) @1))
 
+ /* Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y) if x and y are vectors in
+    which each element is known to have at least one leading zero bit.  */
+(simplify
+ (rshift (plus:cs @0 @1) integer_onep)
+ (if (VECTOR_TYPE_P (type)
+  && wi::clz (get_nonzero_bits (@0)) > 0
+  && wi::clz (get_nonzero_bits (@1)) > 0)
+  (IFN_AVG_FLOOR @0 @1)))
+
 /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST))
when profitable.
For bitwise binary operations apply operand conversions to the
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
new file mode 100644
index 000..f1748a199ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
@@ -0,0 +1,34 @@
+/* Test if SIMD fused unsigned halving adds are generated */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \
+  vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \
+  { \
+vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+vectype v2 = vdup ## q ## _n_ ## ts (mask); \
+return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  } \
+  \
+  vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \
+  { \
+vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \
+return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  }
+
+FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f)
+FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f)
+FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff)
+FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff)
+FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fff)
+FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fff)
+
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 8d2cbb384c4..3e0c75cf2be 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -298,7 +298,7 @@ get_default_value (tree var)
{
  val.lattice_val = VARYING;
  val.mask = -1;
- if (flag_tree_bit_ccp)
+ if (flag_tree_bit_ccp && !VECTOR_TYPE_P (TREE_TYPE (var)))
{
  wide_int nonzero_bits = get_nonzero_bits (var);
  tree value;
@@ -2491,11 +2491,11 @@ evaluate_stmt (gimple *stmt)
   is_constant = (val.lattice_val == CONSTANT);
 }
 
+  tree lhs = gimple_get_lhs (stmt);
   if (flag_tree_bit_ccp
+  && lhs && TREE_CODE (lhs) == SSA_NAME && !VECTOR_TYPE_P (TREE_TYPE (lhs))
   && ((is_constant && TREE_C

[PATCH] vect: Improve vectorization for small-trip-count loops using subvectors

2025-05-08 Thread Pengfei Li
This patch improves the auto-vectorization for loops with known small
trip counts by enabling the use of subvectors - bit fields of original
wider vectors. A subvector must have the same vector element type as the
original vector and enough bits for all vector elements to be processed
in the loop. Using subvectors is beneficial because machine instructions
operating on narrower vectors usually show better performance.

To enable this optimization, this patch introduces a new target hook.
This hook allows the vectorizer to query the backend for a suitable
subvector type given the original vector type and the number of elements
to be processed in the small-trip-count loop. The target hook also has a
could_trap parameter to indicate whether the subvector is allowed to have
more bits than needed.
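A rough sketch of how the vectorizer side of such a query might look
(hypothetical call site written for illustration only; the hook name follows
TARGET_VECTORIZE_FIND_SUBVECTOR_TYPE and the parameters mirror the AArch64
implementation further down):

/* Illustrative only -- not the actual tree-vect-stmts.cc change.  Ask the
   target for a narrower vector type that can hold ELEM_CNT elements of
   VECTYPE; COULD_TRAP says whether touching extra, unneeded lanes might
   fault, in which case the subvector must not have spare bits.  */
tree subvectype
  = targetm.vectorize.find_subvector_type (vectype, elem_cnt, could_trap);
if (subvectype != NULL_TREE)
  {
    /* Take a BIT_FIELD_REF of the wide vector with type SUBVECTYPE, perform
       the operation on it, and use the narrow result in place of the wide
       operation's result.  */
  }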

This optimization is currently enabled for AArch64 only. Below example
shows how it uses AdvSIMD vectors as subvectors of SVE vectors for
higher instruction throughput.

Consider this loop operating on an array of 16-bit integers:

for (int i = 0; i < 5; i++) {
  a[i] = a[i] < 0 ? -a[i] : a[i];
}

Before this patch, the generated AArch64 code would be:

ptrue   p7.h, vl5
ptrue   p6.b, all
ld1h    z31.h, p7/z, [x0]
abs     z31.h, p6/m, z31.h
st1h    z31.h, p7, [x0]

After this patch, it is optimized to:

ptrue   p7.h, vl5
ld1h    z31.h, p7/z, [x0]
abs     v31.8h, v31.8h
st1h    z31.h, p7, [x0]

This patch also helps eliminate one of the ptrue instructions in this case.

Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_find_subvector_type):
Implement target hook for finding subvectors for AArch64.
* doc/tm.texi: Document the new target hook.
* doc/tm.texi.in: Document the new target hook.
* expmed.cc (extract_bit_field_as_subreg): Support expanding
BIT_FIELD_REF for subvector types to SUBREG in RTL.
* match.pd: Prevent simplification of BIT_FIELD_REF for
subvector types to VIEW_CONVERT.
* target.def: New target hook definition.
* targhooks.cc (default_vectorize_find_subvector_type): Provide
default implementation for the target hook.
* tree-cfg.cc (verify_types_in_gimple_reference): Update GIMPLE
verification for BIT_FIELD_REF used for subvectors.
* tree-vect-stmts.cc (vectorizable_operation): Output vectorized
GIMPLE with subvector types.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/cond_unary_6.c: Adjust loop trip counts
to avoid triggering this new optimization.
* gcc.target/aarch64/vect-subvector-1.c: New test.
* gcc.target/aarch64/vect-subvector-2.c: New test.
---
 gcc/config/aarch64/aarch64.cc | 39 
 gcc/doc/tm.texi   | 12 +++
 gcc/doc/tm.texi.in|  2 +
 gcc/expmed.cc |  5 +-
 gcc/match.pd  |  3 +-
 gcc/target.def| 17 
 gcc/targhooks.cc  |  8 ++
 gcc/targhooks.h   |  3 +
 .../gcc.target/aarch64/sve/cond_unary_6.c |  4 +-
 .../gcc.target/aarch64/vect-subvector-1.c | 28 ++
 .../gcc.target/aarch64/vect-subvector-2.c | 28 ++
 gcc/tree-cfg.cc   |  8 ++
 gcc/tree-vect-stmts.cc| 90 ++-
 13 files changed, 240 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-subvector-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-subvector-2.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index fff8d9da49d..700f1646706 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17012,6 +17012,42 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
 }
 }
 
+/* Implement TARGET_VECTORIZE_FIND_SUBVECTOR_TYPE.  */
+static tree
+aarch64_find_subvector_type (tree vectype, unsigned HOST_WIDE_INT elem_cnt,
+bool could_trap)
+{
+  gcc_assert (VECTOR_TYPE_P (vectype));
+
+  /* AArch64 AdvSIMD vectors are treated as subvectors of SVE for all
+ vectorization preferences except "sve-only".  */
+  if (aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY)
+return NULL_TREE;
+
+  /* No subvectors for AdvSIMD or partial vectors, since elements in partial
+ vectors could be non-consecutive.  */
+  machine_mode mode = TYPE_MODE (vectype);
+  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+  if ((vec_flags & VEC_ADVSIMD) || (vec_flags & VEC_PARTIAL))
+return NULL_TREE;
+
+  tree innertype = TREE_TYPE (vectype);
+  unsigned int scalar_prec = TYPE_PRECISION (innertype);
+  unsigned int data_bits = elem_cnt * scalar_prec;
+
+  /* If the operation could trap, w