[PATCH] simplify-rtx: Combine bitwise operations in more cases
This patch transforms RTL expressions of the form (subreg (not X) off) into
(not (subreg X off)) when the subreg is an operand of a bitwise AND or OR.
This transformation can expose opportunities to combine a NOT operation with
the bitwise AND/OR.  For example, it improves the codegen of the following
AArch64 NEON intrinsics:

  vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(a)),
            vreinterpretq_s64_s32(b));

from:

  not  v0.16b, v0.16b
  and  v0.16b, v0.16b, v1.16b

to:

  bic  v0.16b, v1.16b, v0.16b

Regression tested on x86_64-linux-gnu, arm-linux-gnueabihf and
aarch64-linux-gnu.

gcc/ChangeLog:

	* simplify-rtx.cc (simplify_context::simplify_binary_operation_1):
	Add RTX simplification for bitwise AND/OR.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/bic_orn_1.c: New test.
---
 gcc/simplify-rtx.cc                            | 24 +++
 .../gcc.target/aarch64/simd/bic_orn_1.c        | 17 +
 2 files changed, 41 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 88d31a71c05..ed620ef5d45 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -3738,6 +3738,18 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
 	  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), op1))
 	return simplify_gen_binary (IOR, mode, XEXP (op0, 1), op1);
 
+      /* Convert (ior (subreg (not X) off) Y) into (ior (not (subreg X off)) Y)
+	 to expose opportunities to combine IOR and NOT.  */
+      if (GET_CODE (op0) == SUBREG
+	  && GET_CODE (SUBREG_REG (op0)) == NOT)
+	{
+	  rtx new_subreg = gen_rtx_SUBREG (mode,
+					   XEXP (SUBREG_REG (op0), 0),
+					   SUBREG_BYTE (op0));
+	  rtx new_not = simplify_gen_unary (NOT, mode, new_subreg, mode);
+	  return simplify_gen_binary (IOR, mode, new_not, op1);
+	}
+
       tem = simplify_byte_swapping_operation (code, mode, op0, op1);
       if (tem)
 	return tem;
@@ -4274,6 +4286,18 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
 	    return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0),
 					XEXP (op0, 1));
 	}
 
+      /* Convert (and (subreg (not X) off) Y) into (and (not (subreg X off)) Y)
+	 to expose opportunities to combine AND and NOT.  */
+      if (GET_CODE (op0) == SUBREG
+	  && GET_CODE (SUBREG_REG (op0)) == NOT)
+	{
+	  rtx new_subreg = gen_rtx_SUBREG (mode,
+					   XEXP (SUBREG_REG (op0), 0),
+					   SUBREG_BYTE (op0));
+	  rtx new_not = simplify_gen_unary (NOT, mode, new_subreg, mode);
+	  return simplify_gen_binary (AND, mode, new_not, op1);
+	}
+
       tem = simplify_byte_swapping_operation (code, mode, op0, op1);
       if (tem)
 	return tem;
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c b/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c
new file mode 100644
index 000..1c66f21424e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+int64x2_t bic_16b (int32x4_t a, int32x4_t b) {
+  return vandq_s64 (vreinterpretq_s64_s32 (vmvnq_s32 (a)),
+		    vreinterpretq_s64_s32 (b));
+}
+
+int16x4_t orn_8b (int32x2_t a, int32x2_t b) {
+  return vorr_s16 (vreinterpret_s16_s32 (a),
+		   vreinterpret_s16_s32 (vmvn_s32 (b)));
+}
+
+/* { dg-final { scan-assembler {\tbic\tv[0-9]+\.16b} } } */
+/* { dg-final { scan-assembler {\torn\tv[0-9]+\.8b} } } */
-- 
2.43.0
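[Editorial illustration, not part of the patch: the combined form that this change lets simplify-rtx/combine reach is what one would otherwise write by hand with the BIC intrinsic.  The function name below is hypothetical; "should" because the exact instruction selection still depends on the backend.]

  #include <arm_neon.h>

  /* Hand-written equivalent of bic_16b above: b & ~a, with both operands
     reinterpreted to 64-bit lanes first.  vbicq_s64 (x, y) computes x & ~y.  */
  int64x2_t bic_16b_by_hand (int32x4_t a, int32x4_t b)
  {
    return vbicq_s64 (vreinterpretq_s64_s32 (b),
                      vreinterpretq_s64_s32 (a));
  }

With the patch applied, both spellings should compile to a single BIC.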
[PATCH v2] simplify-rtx: Combine bitwise operations in more cases
This patch transforms RTL expressions of the form (subreg (not X)) into
(not (subreg X)) if the subreg is an operand of another binary logical
operation.  This transformation can expose opportunities to combine more
logical operations.  For example, it improves the codegen of the following
AArch64 NEON intrinsics:

  vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(a)),
            vreinterpretq_s64_s32(b));

from:

  not  v0.16b, v0.16b
  and  v0.16b, v0.16b, v1.16b

to:

  bic  v0.16b, v1.16b, v0.16b

Regression tested on x86_64-linux-gnu, arm-linux-gnueabihf and
aarch64-linux-gnu.

gcc/ChangeLog:

	* simplify-rtx.cc (non_paradoxical_subreg_not_p): New function for
	pattern match of (subreg (not X)).
	(simplify_with_subreg_not): New function for simplification.
---
 gcc/simplify-rtx.cc                            | 50 +++
 .../gcc.target/aarch64/simd/bic_orn_1.c        | 17 +++
 2 files changed, 67 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 06b52ca8003..5a6c1a9c039 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -3032,6 +3032,44 @@ match_plus_neg_pattern (rtx op0, rtx op1, machine_mode mode)
   return false;
 }
 
+/* Check if OP matches the pattern of (subreg (not X)) and the subreg is
+   non-paradoxical.  */
+
+static bool
+non_paradoxical_subreg_not_p (rtx op)
+{
+  return GET_CODE (op) == SUBREG
+	 && !paradoxical_subreg_p (op)
+	 && GET_CODE (SUBREG_REG (op)) == NOT;
+}
+
+/* Convert (binop (subreg (not X)) Y) into (binop (not (subreg X)) Y), or
+   (binop X (subreg (not Y))) into (binop X (not (subreg Y))) to expose
+   opportunities to combine another binary logical operation with NOT.  */
+
+static rtx
+simplify_with_subreg_not (rtx_code binop, machine_mode mode, rtx op0, rtx op1)
+{
+  rtx opn = NULL_RTX;
+  if (non_paradoxical_subreg_not_p (op0))
+    opn = op0;
+  else if (non_paradoxical_subreg_not_p (op1))
+    opn = op1;
+
+  if (opn == NULL_RTX)
+    return NULL_RTX;
+
+  rtx new_subreg = simplify_gen_subreg (mode,
+					XEXP (SUBREG_REG (opn), 0),
+					GET_MODE (SUBREG_REG (opn)),
+					SUBREG_BYTE (opn));
+  rtx new_not = simplify_gen_unary (NOT, mode, new_subreg, mode);
+  if (opn == op0)
+    return simplify_gen_binary (binop, mode, new_not, op1);
+  else
+    return simplify_gen_binary (binop, mode, op0, new_not);
+}
+
 /* Subroutine of simplify_binary_operation.  Simplify a binary operation
    CODE with result mode MODE, operating on OP0 and OP1.  If OP0 and/or
    OP1 are constant pool references, TRUEOP0 and TRUEOP1 represent the
@@ -3749,6 +3787,10 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
 	  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), op1))
 	return simplify_gen_binary (IOR, mode, XEXP (op0, 1), op1);
 
+      tem = simplify_with_subreg_not (code, mode, op0, op1);
+      if (tem)
+	return tem;
+
       tem = simplify_byte_swapping_operation (code, mode, op0, op1);
       if (tem)
 	return tem;
@@ -4017,6 +4059,10 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
 	  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), op1))
 	return simplify_gen_binary (IOR, mode, XEXP (op0, 1), op1);
 
+      tem = simplify_with_subreg_not (code, mode, op0, op1);
+      if (tem)
+	return tem;
+
       tem = simplify_byte_swapping_operation (code, mode, op0, op1);
       if (tem)
 	return tem;
@@ -4285,6 +4331,10 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
 	    return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0),
 					XEXP (op0, 1));
 	}
 
+      tem = simplify_with_subreg_not (code, mode, op0, op1);
+      if (tem)
+	return tem;
+
       tem = simplify_byte_swapping_operation (code, mode, op0, op1);
       if (tem)
 	return tem;
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c b/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c
new file mode 100644
index 000..1c66f21424e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/bic_orn_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+int64x2_t bic_16b (int32x4_t a, int32x4_t b) {
+  return vandq_s64 (vreinterpretq_s64_s32 (vmvnq_s32 (a)),
+		    vreinterpretq_s64_s32 (b));
+}
+
+int16x4_t orn_8b (int32x2_t a, int32x2_t b) {
+  return vorr_s16 (vreinterpret_s16_s32 (a),
+		   vreinterpret_s16_s32 (vmvn_s32 (b)));
+}
+
+/* { dg-final { scan-assembler {\tbic\tv[0-9]+\.16b} } } */
+/* { dg-final { scan-assembler {\torn\tv[0-9]+\.8b} } } */
-- 
2.43.0
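[Editorial illustration, not part of the patch: since the v2 form keys on the RTL shape rather than on any particular intrinsic, the same fold should also help plain GNU vector-extension code that mixes lane widths.  The type and function names below are hypothetical, and whether BIC is actually emitted still depends on the backend patterns.]

  typedef int       v4si __attribute__ ((vector_size (16)));
  typedef long long v2di __attribute__ ((vector_size (16)));

  /* ~a is computed in 32-bit lanes, then reinterpreted (typically a subreg
     in RTL) and ANDed in 64-bit lanes, i.e. the (and (subreg (not X)) Y)
     shape handled above.  */
  v2di bic_generic (v4si a, v2di b)
  {
    return (v2di) ~a & b;
  }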
Re: [PATCH] simplify-rtx: Combine bitwise operations in more cases
Thanks, Richard, for all the review comments.  I have addressed them and
sent a v2 patch in a new email thread.

--
Thanks,
Pengfei
[PATCH] AArch64: Fold unsigned ADD + LSR by 1 to UHADD
This patch implements the folding of a vector addition followed by a logical
shift right by 1 (add + lsr #1) on AArch64 into an unsigned halving add,
allowing GCC to emit NEON or SVE2 UHADD instructions.  For example, this
patch helps improve the codegen from:

  add   v0.4s, v0.4s, v31.4s
  ushr  v0.4s, v0.4s, 1

to:

  uhadd v0.4s, v0.4s, v31.4s

For NEON, vector operations are represented using generic mid-end
operations, so new folding rules are added to match.pd.  For SVE2, the
operations are represented using built-in GIMPLE calls, so this optimization
is implemented via gimple_folder.  To ensure correctness, additional checks
are introduced to guarantee that the operands to UHADD are vectors in which
each element has its top bit cleared.

This patch has been bootstrapped and regression tested on x86_64-linux-gnu
and aarch64-linux-gnu.

gcc/ChangeLog:

	* config/aarch64/aarch64-sve-builtins-base.cc (find_sve_builtin_call):
	New helper function for finding and checking a GIMPLE call.
	(is_undef): Rewrite with find_sve_builtin_call.
	(class svlsr_impl): Implement the folding for SVE2.
	(FUNCTION): Check and fold the pattern.
	* match.pd: Add new rules to implement the folding for NEON.
	* tree.cc (top_bit_zero_vector_p): Add a new utility function for
	vector top bit zero check.
	* tree.h (top_bit_zero_vector_p): Add a function declaration.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/acle/uhadd_1.c: New test.
	* gcc.target/aarch64/sve2/acle/general/uhadd_1.c: New test.
---
 .../aarch64/aarch64-sve-builtins-base.cc       | 101 --
 gcc/match.pd                                   |   7 ++
 .../gcc.target/aarch64/acle/uhadd_1.c          |  34 ++
 .../aarch64/sve2/acle/general/uhadd_1.c        |  30 ++
 gcc/tree.cc                                    |  30 ++
 gcc/tree.h                                     |   4 +
 6 files changed, 199 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index b4396837c24..ce6da82bf81 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -43,6 +43,7 @@
 #include "aarch64-sve-builtins.h"
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
+#include "aarch64-sve-builtins-sve2.h"
 #include "aarch64-sve-builtins-functions.h"
 #include "aarch64-builtins.h"
 #include "ssa.h"
@@ -53,6 +54,23 @@ using namespace aarch64_sve;
 
 namespace {
 
+/* Return gcall* if VAL is an SSA_NAME defined by the given SVE intrinsics call.
+   Otherwise return NULL.  */
+static gcall*
+find_sve_builtin_call (tree val, const function_base *func)
+{
+  if (TREE_CODE (val) == SSA_NAME)
+    {
+      gimple *def = SSA_NAME_DEF_STMT (val);
+      if (gcall *call = dyn_cast<gcall *> (def))
+	if (tree fndecl = gimple_call_fndecl (call))
+	  if (const function_instance *instance = lookup_fndecl (fndecl))
+	    if (instance->base == func)
+	      return call;
+    }
+  return NULL;
+}
+
 /* Return true if VAL is an undefined value.  */
 static bool
 is_undef (tree val)
@@ -62,12 +80,7 @@ is_undef (tree val)
       if (ssa_undefined_value_p (val, false))
 	return true;
 
-      gimple *def = SSA_NAME_DEF_STMT (val);
-      if (gcall *call = dyn_cast<gcall *> (def))
-	if (tree fndecl = gimple_call_fndecl (call))
-	  if (const function_instance *instance = lookup_fndecl (fndecl))
-	    if (instance->base == functions::svundef)
-	      return true;
+      return (find_sve_builtin_call (val, functions::svundef) != NULL);
     }
   return false;
 }
@@ -2088,6 +2101,80 @@ public:
   }
 };
 
+class svlsr_impl : public rtx_code_function
+{
+private:
+  /* Return true if we know active lanes for use in T have top bit zero, where
+     pg_use tells which lanes are active for use.  */
+  bool
+  active_lanes_top_bit_zero_p (tree t, tree pg_use) const
+  {
+    /* Return true if T itself is a vector in which each element has top bit
+       zero.  */
+    if (top_bit_zero_vector_p (t))
+      return true;
+
+    /* Return true if T is an AND op with a vector in which each element has
+       top bit zero.  Note the predicate for AND op should cover active lanes
+       for use.  */
+    gcall *and_call = find_sve_builtin_call (t, functions::svand);
+    if (and_call != NULL)
+      {
+	tree pg = gimple_call_arg (and_call, 0);
+	if (pg == pg_use || is_ptrue (pg, element_precision (t) / CHAR_BIT))
+	  {
+	    return top_bit_zero_vector_p (gimple_call_arg (and_call, 1))
+		   || top_bit_zero_vector_p (gimple_call_arg (and_call, 2));
+	  }
+      }
+
+    return false;
+  }
+
+public:
+  CONSTEXPR svlsr_
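[Editorial illustration, not part of the patch: as a point of reference for the NEON half of this change, the folded form corresponds to what one would write directly with the halving-add intrinsic.  The function names are hypothetical; the masking keeps the top bit of each lane clear, which is exactly the condition the patch checks.]

  #include <arm_neon.h>

  /* Written with add + shift: eligible for the fold because both operands
     have their top bit cleared by the preceding AND, so the 32-bit add
     cannot wrap.  */
  uint32x4_t havg_shift (uint32x4_t a, uint32x4_t b)
  {
    a = vandq_u32 (a, vdupq_n_u32 (0x7fffffff));
    b = vandq_u32 (b, vdupq_n_u32 (0x7fffffff));
    return vshrq_n_u32 (vaddq_u32 (a, b), 1);
  }

  /* The same computation written directly with the UHADD intrinsic.  */
  uint32x4_t havg_direct (uint32x4_t a, uint32x4_t b)
  {
    a = vandq_u32 (a, vdupq_n_u32 (0x7fffffff));
    b = vandq_u32 (b, vdupq_n_u32 (0x7fffffff));
    return vhaddq_u32 (a, b);
  }

With the fold in place, both functions should end up as AND followed by a single UHADD.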
Re: [PATCH] (not just) AArch64: Fold unsigned ADD + LSR by 1 to UHADD
> Heh.  This is a bit of a hobby-horse of mine.  IMO we should be trying
> to make the generic, target-independent vector operations as useful
> as possible, so that people only need to resort to target-specific
> intrinsics if they're doing something genuinely target-specific.
> At the moment, we have the problem that the intrinsics are being
> used for two very different purposes:
>
> (1) Let people who know the architecture well write high-level assembly.
>     For this use case, the compiler should only interfere with the
>     user's instruction selection if the compiler can be sure that
>     it's improving things.
>
> (2) Vector intrinsics just express dataflow, with no expectation from the
>     user about how the intrinsics will be implemented.  In this use case,
>     svand is "&, but for SVE vectors".  The user wants to do an "&",
>     looks up the SVE intrinsic for AND, and writes "svand" (or more
>     likely, uses a retargetable SIMD framework that does this for them).
>     Then the compiler is expected to map svand back to "&" internally.
>
> So yeah, IMO we should encourage users in group (2) to use C/C++
> operators or generic builtins where possible, since it expresses the
> intent better and is far less cumbersome.  And I agree that that's the
> more important case as far as this fold goes.  So personally I'd be
> happy with just that.
>
> But getting nonzero_bits information out of intrinsics is a legitimate
> use case too.  It's up to you whether you want to go that far.

Thank you for sharing your thoughts.  AFAIK, the initial motivation of this
fold was to optimize some code patterns in SLEEF, and that can be done
without the SVE built-in part.  The SVE built-in part is what I thought
could be extended to handle more cases in the future, but I'm not sure
there's a real need for it now.  So I'm going to split my patch and drop the
SVE built-in part for the moment.  I can redo it later when either there is
a clear need or I've figured out a better way to implement it.

--
Thanks,
Pengfei
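[Editorial illustration, not from the thread: the two "groups" above, made concrete with the same bitwise AND written both ways.  Operator support on SVE ACLE types is per the GCC 15 note later in the thread, so treat the first function as an assumption about compiler version.]

  #include <arm_sve.h>

  /* Group (2) style: generic dataflow, written with the & operator and
     left to the compiler to map onto SVE.  */
  svuint32_t and_generic (svuint32_t x, svuint32_t y)
  {
    return x & y;
  }

  /* Group (1) style: explicit intrinsic with an explicit predicate.  */
  svuint32_t and_intrinsic (svbool_t pg, svuint32_t x, svuint32_t y)
  {
    return svand_u32_x (pg, x, y);
  }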
Re: [PATCH] (not just) AArch64: Fold unsigned ADD + LSR by 1 to UHADD
Thank you for the comments.

> I don't think we can use an unbounded recursive walk, since that
> would become quadratic if we ever used it when optimising one
> AND in a chain of ANDs.  (And using this function for ANDs
> seems plausible.)  Maybe we should be handling the information
> in a similar way to Ranger.

I'm trying to get rid of the recursion by reusing the code in
get_nonzero_bits().

> Rather than handle the built-in case entirely in target code, how about
> having a target hook into nonzero_element_bits (or whatever replaces it)
> for machine-dependent builtins?

From the perspective of necessity, do you think it's worth checking the
"svand" call inside, or worth handling the whole built-in case?  Operations
with ACLE SVE types can also be folded as long as we use the general C/C++
operators, which have been supported since GCC 15.

Thanks,
Pengfei
[PATCH v2] match.pd: Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y) for vectors
This patch folds vector expressions of the form (x + y) >> 1 into
IFN_AVG_FLOOR (x, y), reducing instruction count on platforms that support
averaging operations.  For example, it can help improve the codegen on
AArch64 from:

  add   v0.4s, v0.4s, v31.4s
  ushr  v0.4s, v0.4s, 1

to:

  uhadd v0.4s, v0.4s, v31.4s

As this folding is only valid when the most significant bit of each element
in both x and y is known to be zero, this patch checks the leading zero bits
of the elements in x and y, and extends get_nonzero_bits_1() to handle
uniform vectors.  When the input is a uniform vector, the function now
returns the nonzero bits of its element.

Additionally, this patch adds more checks to reject vector types in bit
constant propagation (tree-bit-ccp), since tree-bit-ccp was designed for
scalar values only, and the new vector logic in get_nonzero_bits_1() could
lead to incorrect propagation results.

Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/ChangeLog:

	* match.pd: Add folding rule for vector average.
	* tree-ssa-ccp.cc (get_default_value): Reject vector types.
	(evaluate_stmt): Reject vector types.
	* tree-ssanames.cc (get_nonzero_bits_1): Extend to handle uniform
	vectors.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/acle/uhadd_1.c: New test.
---
 gcc/match.pd                                   |  9 +
 .../gcc.target/aarch64/acle/uhadd_1.c          | 34 +++
 gcc/tree-ssa-ccp.cc                            |  8 ++---
 gcc/tree-ssanames.cc                           |  8 +
 4 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c

diff --git a/gcc/match.pd b/gcc/match.pd
index ab496d923cc..ddd16a10944 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2177,6 +2177,15 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (view_convert (rshift (view_convert:ntype @0) @1))
    (convert (rshift (convert:ntype @0) @1))
 
+ /* Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y) if x and y are vectors in
+    which each element is known to have at least one leading zero bit.  */
+ (simplify
+  (rshift (plus:cs @0 @1) integer_onep)
+  (if (VECTOR_TYPE_P (type)
+       && wi::clz (get_nonzero_bits (@0)) > 0
+       && wi::clz (get_nonzero_bits (@1)) > 0)
+   (IFN_AVG_FLOOR @0 @1)))
+
 /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST))
    when profitable.
    For bitwise binary operations apply operand conversions to the
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
new file mode 100644
index 000..f1748a199ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
@@ -0,0 +1,34 @@
+/* Test if SIMD fused unsigned halving adds are generated */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \
+  vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vdup ## q ## _n_ ## ts (mask); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  } \
+  \
+  vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  }
+
+FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f)
+FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f)
+FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff)
+FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff)
+FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fff)
+FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fff)
+
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 8d2cbb384c4..3e0c75cf2be 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -298,7 +298,7 @@ get_default_value (tree var)
     {
       val.lattice_val = VARYING;
       val.mask = -1;
-      if (flag_tree_bit_ccp)
+      if (flag_tree_bit_ccp && !VECTOR_TYPE_P (TREE_TYPE (var)))
 	{
 	  wide_int nonzero_bits = get_nonzero_bits (var);
 	  tree value;
@@ -2491,11 +2491,11 @@ evaluate_stmt (gimple *stmt)
       is_constant = (val.lattice_val == CONSTANT);
     }
 
+  tree lhs = gimple_get_lhs (stmt);
   if (flag_tree_bit_ccp
+      && lhs && TREE_CODE (lhs) == SSA_NAME && !VECTOR_TYPE_P (TREE_TYPE (lhs))
      && ((is_constant && TREE_C
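[Editorial illustration, not part of the patch or testsuite: the clz > 0 requirement is what makes the fold exact.  If every element has its top bit clear, the lane-wide addition cannot wrap, so the truncated shift equals the true floor average.  A tiny scalar sanity check of that argument for 8-bit lanes:]

  #include <assert.h>
  #include <stdint.h>

  int main (void)
  {
    /* With the top bit clear, x + y <= 0xfe, so the 8-bit sum never wraps
       and (x + y) >> 1 is exactly floor ((x + y) / 2).  */
    for (unsigned x = 0; x <= 0x7f; x++)
      for (unsigned y = 0; y <= 0x7f; y++)
        {
          uint8_t narrow = (uint8_t) ((uint8_t) (x + y) >> 1);
          unsigned exact = (x + y) / 2;
          assert (narrow == exact);
        }
    return 0;
  }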
[PATCH] vect: Improve vectorization for small-trip-count loops using subvectors
This patch improves auto-vectorization for loops with known small trip
counts by enabling the use of subvectors - bit fields of the original wider
vectors.  A subvector must have the same vector element type as the original
vector and enough bits for all vector elements to be processed in the loop.
Using subvectors is beneficial because machine instructions operating on
narrower vectors usually show better performance.

To enable this optimization, this patch introduces a new target hook.  The
hook allows the vectorizer to query the backend for a suitable subvector
type, given the original vector type and the number of elements to be
processed in the small-trip-count loop.  The hook also has a could_trap
parameter to say whether the subvector is allowed to have more bits than
needed.

This optimization is currently enabled for AArch64 only.  The example below
shows how it uses AdvSIMD vectors as subvectors of SVE vectors for higher
instruction throughput.  Consider this loop operating on an array of 16-bit
integers:

  for (int i = 0; i < 5; i++)
    {
      a[i] = a[i] < 0 ? -a[i] : a[i];
    }

Before this patch, the generated AArch64 code would be:

  ptrue   p7.h, vl5
  ptrue   p6.b, all
  ld1h    z31.h, p7/z, [x0]
  abs     z31.h, p6/m, z31.h
  st1h    z31.h, p7, [x0]

After this patch, it is optimized to:

  ptrue   p7.h, vl5
  ld1h    z31.h, p7/z, [x0]
  abs     v31.8h, v31.8h
  st1h    z31.h, p7, [x0]

This patch also eliminates the second ptrue in this case.

Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_find_subvector_type): Implement
	target hook for finding subvectors for AArch64.
	* doc/tm.texi: Document the new target hook.
	* doc/tm.texi.in: Document the new target hook.
	* expmed.cc (extract_bit_field_as_subreg): Support expanding
	BIT_FIELD_REF for subvector types to SUBREG in RTL.
	* match.pd: Prevent simplification of BIT_FIELD_REF for subvector
	types to VIEW_CONVERT.
	* target.def: New target hook definition.
	* targhooks.cc (default_vectorize_find_subvector_type): Provide
	default implementation for the target hook.
	* tree-cfg.cc (verify_types_in_gimple_reference): Update GIMPLE
	verification for BIT_FIELD_REF used for subvectors.
	* tree-vect-stmts.cc (vectorizable_operation): Output vectorized
	GIMPLE with subvector types.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/cond_unary_6.c: Adjust loop trip counts to
	avoid triggering this new optimization.
	* gcc.target/aarch64/vect-subvector-1.c: New test.
	* gcc.target/aarch64/vect-subvector-2.c: New test.
---
 gcc/config/aarch64/aarch64.cc                  | 39 
 gcc/doc/tm.texi                                | 12 +++
 gcc/doc/tm.texi.in                             |  2 +
 gcc/expmed.cc                                  |  5 +-
 gcc/match.pd                                   |  3 +-
 gcc/target.def                                 | 17 
 gcc/targhooks.cc                               |  8 ++
 gcc/targhooks.h                                |  3 +
 .../gcc.target/aarch64/sve/cond_unary_6.c      |  4 +-
 .../gcc.target/aarch64/vect-subvector-1.c      | 28 ++
 .../gcc.target/aarch64/vect-subvector-2.c      | 28 ++
 gcc/tree-cfg.cc                                |  8 ++
 gcc/tree-vect-stmts.cc                         | 90 ++-
 13 files changed, 240 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-subvector-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-subvector-2.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index fff8d9da49d..700f1646706 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17012,6 +17012,42 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
     }
 }
 
+/* Implement TARGET_VECTORIZE_FIND_SUBVECTOR_TYPE.  */
+static tree
+aarch64_find_subvector_type (tree vectype, unsigned HOST_WIDE_INT elem_cnt,
+			     bool could_trap)
+{
+  gcc_assert (VECTOR_TYPE_P (vectype));
+
+  /* AArch64 AdvSIMD vectors are treated as subvectors of SVE for all
+     vectorization preferences except "sve-only".  */
+  if (aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY)
+    return NULL_TREE;
+
+  /* No subvectors for AdvSIMD or partial vectors, since elements in partial
+     vectors could be non-consecutive.  */
+  machine_mode mode = TYPE_MODE (vectype);
+  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+  if ((vec_flags & VEC_ADVSIMD) || (vec_flags & VEC_PARTIAL))
+    return NULL_TREE;
+
+  tree innertype = TREE_TYPE (vectype);
+  unsigned int scalar_prec = TYPE_PRECISION (innertype);
+  unsigned int data_bits = elem_cnt * scalar_prec;
+
+  /* If the operation could trap, w
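[Editorial illustration, not part of the patch: a complete, compilable form of the example loop from the description, for anyone wanting to reproduce the before/after code.  The int16_t element type is an assumption inferred from the ld1h/.h forms in the quoted assembly; compile with SVE enabled, e.g. -O2 -march=armv8-a+sve.]

  #include <stdint.h>

  /* Known small trip count: only 5 of the at-least-8 16-bit lanes of an SVE
     vector are needed, so the abs can run on an AdvSIMD subvector.  */
  void abs5 (int16_t *a)
  {
    for (int i = 0; i < 5; i++)
      a[i] = a[i] < 0 ? -a[i] : a[i];
  }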