Hi,

Thanks for the feedback, here is the updated patch:
> > Note: This patch series is working incrementally towards generating the
> > most efficient code for this and other loops in small steps.
>
> It looks like this could be done in the vectoriser via an extension of the
> scalar_cond_masked_set mechanism.  We have:
>
>     mask__54.13_59 = vect_a_15.9_55 > vect_b_17.12_58;
>     vec_mask_and_60 = loop_mask_32 & mask__54.13_59;
>     …
>     mask__30.17_67 = vect_a_15.9_55 > vect_cst__66;
>     mask__29.18_68 = mask__54.13_59 & mask__30.17_67;
>     vec_mask_and_69 = loop_mask_32 & mask__29.18_68;
>
> When vectorising mask__29.18_68, we could test whether each side of the
> "&" is already in scalar_cond_masked_set and AND in the loop mask if so, like
> we do in vectorizable_condition.  We could then separately record that the &
> result includes the loop mask.

Whenever a mask is being generated from a BIT_AND we now mask the operands of
the AND instead and then just AND the results.  This allows us to CSE the masks
and generate the right combination.  However, because reassoc will try to
re-order the masks in the AND, we now have to perform a small local CSE on the
vectorized loop if vectorization is successful.

Note: This patch series is working incrementally towards generating the most
efficient code for this and other loops in small steps.

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-linux-gnu with
no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vectorizer.c (vectorize_loops): Do local CSE through RPO VN
	upon successful vectorization.
	* tree-vect-stmts.c (prepare_load_store_mask): When combining two
	masks mask the operands instead of the combined operation.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/pred-combine-and.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
new file mode 100644
index 0000000000000000000000000000000000000000..d395b7f84bb15b588493611df5a47549726ac24a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
+{
+    for (int i = 0; i < n; i++) {
+        float a = x[i];
+        float b = y[i];
+        if (a > b) {
+            z0[i] = a + b;
+            if (a > c) {
+                z1[i] = a - b;
+            }
+        }
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 4e0b2adf1dc2404bc345af30cfeb9c819084894e..717a25f46aa72534eebeb382c92b9145d7d44d04 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1799,6 +1799,19 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
     return vec_mask;
 
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
+
+  /* Check if the mask is a combination of two different masks.  */
+  gimple *def_stmt = SSA_NAME_DEF_STMT (vec_mask);
+  if (is_gimple_assign (def_stmt)
+      && gimple_assign_rhs_code (def_stmt) == BIT_AND_EXPR)
+    {
+      tree lhs1 = gimple_assign_rhs1 (def_stmt);
+      tree lhs2 = gimple_assign_rhs2 (def_stmt);
+
+      vec_mask = prepare_load_store_mask (mask_type, loop_mask, lhs1, gsi);
+      loop_mask = prepare_load_store_mask (mask_type, loop_mask, lhs2, gsi);
+    }
+
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
					  vec_mask, loop_mask);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 3aa3e2a678328baccc4869fe2c6546e700b92255..84bcd146af7c4dedf6acdd7317954010ad3f281b 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -81,7 +81,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-pretty-print.h"
 #include "opt-problem.h"
 #include "internal-fn.h"
-
+#include "tree-ssa-sccvn.h"
 
 /* Loop or bb location, with hotness information.  */
 dump_user_location_t vect_location;
@@ -1321,6 +1321,27 @@ vectorize_loops (void)
	 ???  Also while we try hard to update loop-closed SSA form we fail
	 to properly do this in some corner-cases (see PR56286).  */
       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals);
+
+      for (i = 1; i < number_of_loops (cfun); i++)
+	{
+	  loop = get_loop (cfun, i);
+	  if (!loop || !single_exit (loop))
+	    continue;
+
+	  bitmap exit_bbs;
+	  /* Perform local CSE, this esp. helps because we emit code for
+	     predicates that need to be shared for optimal predicate usage.
+	     However reassoc will re-order them and prevent CSE from working
+	     as it should.  CSE only the loop body, not the entry.  */
+	  exit_bbs = BITMAP_ALLOC (NULL);
+	  bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
+	  bitmap_set_bit (exit_bbs, loop->latch->index);
+
+	  do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs);
+
+	  BITMAP_FREE (exit_bbs);
+	}
+
       return TODO_cleanup_cfg;
     }
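
For illustration, reusing the SSA names from the quoted dump above (the names
actually emitted after vectorization will differ, and the second temporary
below is purely illustrative), the intent of the prepare_load_store_mask change
is that the combined mask is now built roughly as:

    Before:
      mask__29.18_68  = mask__54.13_59 & mask__30.17_67;
      vec_mask_and_69 = loop_mask_32 & mask__29.18_68;

    After:
      vec_mask_and_60 = loop_mask_32 & mask__54.13_59;
      vec_mask_and_61 = loop_mask_32 & mask__30.17_67;
      vec_mask_and_69 = vec_mask_and_60 & vec_mask_and_61;

so the first operand of the final AND computes the same value as the existing
vec_mask_and_60 and the local RPO VN run added in vectorize_loops can CSE it,
even after reassoc has re-ordered the operands of the scalar AND.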